Discovery Neo Oclib
This commit is contained in:
@@ -1,362 +0,0 @@
|
||||
package stream
|
||||
|
||||
// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
|
||||
//
|
||||
// When a stream write fails because the remote peer is unreachable, the request
|
||||
// is saved here and retried on the next tick. Two levels are defined:
|
||||
//
|
||||
// - dntCritical : retry indefinitely (create / update / delete resource).
|
||||
// - dntModerate : up to dntMaxModerateRetries retries, then abandon.
|
||||
//
|
||||
// Pubsub messages and search streams are explicitly excluded.
|
||||
// Streams initiated from the indexer side are never enqueued here.
|
||||
//
|
||||
// # Crash-resilient persistence
|
||||
//
|
||||
// Critical entries are written to an encrypted file (AES-256-GCM) so they
|
||||
// survive a node crash/restart. The AES key is derived deterministically from
|
||||
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
|
||||
// Moderate entries are intentionally not persisted: their retry budget is small
|
||||
// enough that re-loading them after a restart would be misleading.
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"golang.org/x/crypto/hkdf"
|
||||
|
||||
"oc-discovery/conf"
|
||||
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
// dntLevel classifies how persistently a failed outbound request is retried.
//
// The zero value is deliberately left unassigned. Indexing dntProtocols with a
// protocol that is not registered yields dntLevel(0); if that value were
// dntCritical, every unregistered protocol would silently be treated as
// critical (persisted to disk and retried forever).
type dntLevel int

const (
	_           dntLevel = iota // zero value reserved: protocol not registered
	dntCritical                 // retry until the message is delivered
	dntModerate                 // retry up to dntMaxModerateRetries times
)

const (
	// dntMaxModerateRetries is the number of failed retries after which a
	// moderate entry is abandoned.
	dntMaxModerateRetries = 3

	// dntRetryInterval is the period of the background retry ticker.
	dntRetryInterval = 15 * time.Second
)
|
||||
|
||||
// dntProtocols maps each stream protocol to its DNT level.
|
||||
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
|
||||
var dntProtocols = map[protocol.ID]dntLevel{
|
||||
// Critical — data mutations that must eventually be delivered.
|
||||
ProtocolCreateResource: dntCritical,
|
||||
ProtocolUpdateResource: dntCritical,
|
||||
ProtocolDeleteResource: dntCritical,
|
||||
// Moderate — confirmations / config / planner: 3 retries before abandon.
|
||||
ProtocolVerifyResource: dntModerate,
|
||||
ProtocolSendPlanner: dntModerate,
|
||||
ProtocolConsidersResource: dntModerate,
|
||||
ProtocolMinioConfigResource: dntModerate,
|
||||
ProtocolAdmiraltyConfigResource: dntModerate,
|
||||
}
|
||||
|
||||
// dntEntryJSON is the on-disk representation of a dntEntry.
|
||||
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
|
||||
type dntEntryJSON struct {
|
||||
DID string `json:"did"`
|
||||
Addr pp.AddrInfo `json:"addr"`
|
||||
DT *tools.DataType `json:"dt,omitempty"`
|
||||
User string `json:"user"`
|
||||
Payload []byte `json:"payload"`
|
||||
Proto protocol.ID `json:"proto"`
|
||||
Retries int `json:"retries"`
|
||||
AddedAt time.Time `json:"added_at"`
|
||||
}
|
||||
|
||||
type dntEntry struct {
|
||||
did string
|
||||
addr pp.AddrInfo
|
||||
dt *tools.DataType
|
||||
user string
|
||||
payload []byte
|
||||
proto protocol.ID
|
||||
retries int
|
||||
addedAt time.Time
|
||||
}
|
||||
|
||||
func (e *dntEntry) toJSON() dntEntryJSON {
|
||||
return dntEntryJSON{
|
||||
DID: e.did,
|
||||
Addr: e.addr,
|
||||
DT: e.dt,
|
||||
User: e.user,
|
||||
Payload: e.payload,
|
||||
Proto: e.proto,
|
||||
Retries: e.retries,
|
||||
AddedAt: e.addedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func entryFromJSON(j dntEntryJSON) *dntEntry {
|
||||
return &dntEntry{
|
||||
did: j.DID,
|
||||
addr: j.Addr,
|
||||
dt: j.DT,
|
||||
user: j.User,
|
||||
payload: j.Payload,
|
||||
proto: j.Proto,
|
||||
retries: j.Retries,
|
||||
addedAt: j.AddedAt,
|
||||
}
|
||||
}
|
||||
|
||||
type dntCache struct {
|
||||
mu sync.Mutex
|
||||
entries []*dntEntry
|
||||
// aesKey is the derived AES-256 key used for on-disk encryption.
|
||||
// Nil when key derivation failed: persistence is disabled but the in-memory
|
||||
// cache continues to function normally.
|
||||
aesKey []byte
|
||||
}
|
||||
|
||||
// newDNTCache initialises the cache, derives the encryption key, and restores
|
||||
// any critical entries that were persisted before the last crash.
|
||||
func newDNTCache() *dntCache {
|
||||
log := oclib.GetLogger()
|
||||
c := &dntCache{}
|
||||
key, err := deriveDNTKey()
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
|
||||
} else {
|
||||
c.aesKey = key
|
||||
c.loadFromDisk()
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// enqueue adds an entry to the cache and persists critical entries to disk.
|
||||
func (c *dntCache) enqueue(e *dntEntry) {
|
||||
c.mu.Lock()
|
||||
c.entries = append(c.entries, e)
|
||||
c.mu.Unlock()
|
||||
if dntProtocols[e.proto] == dntCritical {
|
||||
go c.persistToDisk()
|
||||
}
|
||||
}
|
||||
|
||||
// drain atomically removes and returns all current entries.
|
||||
func (c *dntCache) drain() []*dntEntry {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
out := c.entries
|
||||
c.entries = nil
|
||||
return out
|
||||
}
|
||||
|
||||
// requeue puts entries back at the head of the list, preserving any new
|
||||
// entries added while the retry loop was running.
|
||||
func (c *dntCache) requeue(entries []*dntEntry) {
|
||||
if len(entries) == 0 {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
c.entries = append(entries, c.entries...)
|
||||
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────
|
||||
|
||||
// dntCachePath returns the path of the on-disk cache file, placed next to the
|
||||
// node's private key so it lives on the same persistent volume.
|
||||
func dntCachePath() string {
|
||||
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
|
||||
}
|
||||
|
||||
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
|
||||
// using HKDF-SHA256. The derivation is deterministic: the same key is always
|
||||
// produced from the same private key, so no symmetric secret needs storing.
|
||||
func deriveDNTKey() ([]byte, error) {
|
||||
priv, err := tools.LoadKeyFromFilePrivate()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
|
||||
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
|
||||
raw, err := priv.Raw()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
|
||||
key := make([]byte, 32)
|
||||
if _, err := io.ReadFull(reader, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
// persistToDisk encrypts all current critical entries and writes them to disk.
|
||||
// Non-critical entries are deliberately excluded — they are not worth restoring
|
||||
// after a restart given their limited retry budget.
|
||||
func (c *dntCache) persistToDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
c.mu.Lock()
|
||||
var toSave []dntEntryJSON
|
||||
for _, e := range c.entries {
|
||||
if dntProtocols[e.proto] == dntCritical {
|
||||
toSave = append(toSave, e.toJSON())
|
||||
}
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
plaintext, err := json.Marshal(toSave)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
nonce := make([]byte, gcm.NonceSize())
|
||||
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
|
||||
return
|
||||
}
|
||||
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
|
||||
|
||||
path := dntCachePath()
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
|
||||
return
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
|
||||
_ = os.Remove(tmp)
|
||||
}
|
||||
}
|
||||
|
||||
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
|
||||
// Errors (missing file, decryption failure) are non-fatal: the cache simply
|
||||
// starts empty, which is safe.
|
||||
func (c *dntCache) loadFromDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
path := dntCachePath()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(data) < gcm.NonceSize() {
|
||||
log.Warn().Msg("[dnt] cache file too short, ignoring")
|
||||
return
|
||||
}
|
||||
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
|
||||
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
var saved []dntEntryJSON
|
||||
if err := json.Unmarshal(plaintext, &saved); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
count := 0
|
||||
for _, j := range saved {
|
||||
// Only restore critical entries — moderate entries are intentionally
|
||||
// not persisted, but this guard defends against format changes.
|
||||
if dntProtocols[j.Proto] != dntCritical {
|
||||
continue
|
||||
}
|
||||
c.entries = append(c.entries, entryFromJSON(j))
|
||||
count++
|
||||
}
|
||||
if count > 0 {
|
||||
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Retry loop ────────────────────────────────────────────────────────────────
|
||||
|
||||
// startDNTLoop runs the background retry goroutine. Call once after init.
|
||||
func (s *StreamService) startDNTLoop() {
|
||||
logger := oclib.GetLogger()
|
||||
ticker := time.NewTicker(dntRetryInterval)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
entries := s.dnt.drain()
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
var keep []*dntEntry
|
||||
for _, e := range entries {
|
||||
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
|
||||
if err == nil {
|
||||
level := dntProtocols[e.proto]
|
||||
if level == dntCritical {
|
||||
logger.Info().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Msg("[dnt] critical message delivered after retry")
|
||||
} else {
|
||||
logger.Info().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Int("retries", e.retries).
|
||||
Msg("[dnt] moderate message delivered after retry")
|
||||
}
|
||||
continue
|
||||
}
|
||||
level := dntProtocols[e.proto]
|
||||
switch level {
|
||||
case dntCritical:
|
||||
keep = append(keep, e)
|
||||
case dntModerate:
|
||||
e.retries++
|
||||
if e.retries < dntMaxModerateRetries {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
logger.Warn().
|
||||
Str("proto", string(e.proto)).
|
||||
Str("peer", e.did).
|
||||
Int("retries", e.retries).
|
||||
Msg("[dnt] moderate message abandoned after max retries")
|
||||
}
|
||||
}
|
||||
}
|
||||
s.dnt.requeue(keep)
|
||||
// Persist after each tick so the on-disk file reflects the current
|
||||
// state (entries delivered are removed, new ones from concurrent
|
||||
// enqueues are included).
|
||||
go s.dnt.persistToDisk()
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,446 @@
|
||||
package stream
|
||||
|
||||
// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
|
||||
//
|
||||
// When a stream write fails because the remote peer is unreachable, the request
|
||||
// is saved here and retried on the next tick. Two levels are defined:
|
||||
//
|
||||
// - DTNCritical : retry indefinitely (create / update / delete resource).
|
||||
// - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
|
||||
//
|
||||
// Pubsub messages and search streams are explicitly excluded.
|
||||
// Streams initiated from the indexer side are never enqueued here.
|
||||
//
|
||||
// # Crash-resilient persistence
|
||||
//
|
||||
// Critical entries are written to an encrypted file (AES-256-GCM) so they
|
||||
// survive a node crash/restart. The AES key is derived deterministically from
|
||||
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
|
||||
// Moderate entries are intentionally not persisted: their retry budget is small
|
||||
// enough that re-loading them after a restart would be misleading.
|
||||
|
||||
import (
|
||||
"crypto/aes"
|
||||
"crypto/cipher"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
oclib "cloud.o-forge.io/core/oc-lib"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"golang.org/x/crypto/hkdf"
|
||||
|
||||
"oc-discovery/conf"
|
||||
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
"github.com/libp2p/go-libp2p/core/protocol"
|
||||
)
|
||||
|
||||
// DTNLevel classifies how persistently a failed outbound request is retried.
type DTNLevel int

const (
	// DTNCritical entries are retried until the message is delivered.
	DTNCritical DTNLevel = iota
	// DTNModerate entries are retried up to DTNMaxModerateRetries times.
	DTNModerate
)

const (
	// DTNMaxModerateRetries is the number of failed retries after which a
	// moderate entry is abandoned.
	DTNMaxModerateRetries = 3

	// DTNRetryInterval is the period of the background retry ticker.
	DTNRetryInterval = 15 * time.Second
)
|
||||
|
||||
// DTNProtocols maps each stream protocol to its DTN level.
|
||||
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
|
||||
var DTNProtocols = map[protocol.ID]DTNLevel{
|
||||
// Critical — data mutations that must eventually be delivered.
|
||||
ProtocolCreateResource: DTNCritical,
|
||||
ProtocolUpdateResource: DTNCritical,
|
||||
ProtocolDeleteResource: DTNCritical,
|
||||
// Moderate — confirmations / config / planner: 3 retries before abandon.
|
||||
ProtocolVerifyResource: DTNModerate,
|
||||
ProtocolSendPlanner: DTNModerate,
|
||||
ProtocolConsidersResource: DTNModerate,
|
||||
ProtocolMinioConfigResource: DTNModerate,
|
||||
ProtocolAdmiraltyConfigResource: DTNModerate,
|
||||
ProtocolSourcePresignResource: DTNModerate,
|
||||
}
|
||||
|
||||
// DTNEntryJSON is the on-disk representation of a DTNEntry.
|
||||
// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
|
||||
type DTNEntryJSON struct {
|
||||
DID string `json:"did"`
|
||||
ResourceID string `json:"resource_id,omitempty"`
|
||||
ForceCritical bool `json:"force_critical,omitempty"`
|
||||
Addr pp.AddrInfo `json:"addr"`
|
||||
DT *tools.DataType `json:"dt,omitempty"`
|
||||
User string `json:"user"`
|
||||
Payload []byte `json:"payload"`
|
||||
Proto protocol.ID `json:"proto"`
|
||||
Retries int `json:"retries"`
|
||||
AddedAt time.Time `json:"added_at"`
|
||||
}
|
||||
|
||||
type DTNEntry struct {
|
||||
did string
|
||||
resourceID string // UUID of the resource; empty for non-resource payloads (planner, config)
|
||||
forceCritical bool // true when destination is NANO: all protocols become critical
|
||||
addr pp.AddrInfo
|
||||
dt *tools.DataType
|
||||
user string
|
||||
payload []byte
|
||||
proto protocol.ID
|
||||
retries int
|
||||
addedAt time.Time
|
||||
}
|
||||
|
||||
// isEffectivelyCritical returns true when the entry must be retried indefinitely,
|
||||
// either because its protocol is inherently critical or because the destination
|
||||
// is a NANO peer (forceCritical).
|
||||
func (e *DTNEntry) isEffectivelyCritical() bool {
|
||||
return DTNProtocols[e.proto] == DTNCritical || e.forceCritical
|
||||
}
|
||||
|
||||
func (e *DTNEntry) toJSON() DTNEntryJSON {
|
||||
return DTNEntryJSON{
|
||||
DID: e.did,
|
||||
ResourceID: e.resourceID,
|
||||
ForceCritical: e.forceCritical,
|
||||
Addr: e.addr,
|
||||
DT: e.dt,
|
||||
User: e.user,
|
||||
Payload: e.payload,
|
||||
Proto: e.proto,
|
||||
Retries: e.retries,
|
||||
AddedAt: e.addedAt,
|
||||
}
|
||||
}
|
||||
|
||||
func entryFromJSON(j DTNEntryJSON) *DTNEntry {
|
||||
return &DTNEntry{
|
||||
did: j.DID,
|
||||
resourceID: j.ResourceID,
|
||||
forceCritical: j.ForceCritical,
|
||||
addr: j.Addr,
|
||||
dt: j.DT,
|
||||
user: j.User,
|
||||
payload: j.Payload,
|
||||
proto: j.Proto,
|
||||
retries: j.Retries,
|
||||
addedAt: j.AddedAt,
|
||||
}
|
||||
}
|
||||
|
||||
type DTNCache struct {
|
||||
mu sync.Mutex
|
||||
entries []*DTNEntry
|
||||
// aesKey is the derived AES-256 key used for on-disk encryption.
|
||||
// Nil when key derivation failed: persistence is disabled but the in-memory
|
||||
// cache continues to function normally.
|
||||
aesKey []byte
|
||||
}
|
||||
|
||||
// newDNTCache initialises the cache, derives the encryption key, and restores
|
||||
// any critical entries that were persisted before the last crash.
|
||||
func newDNTCache() *DTNCache {
|
||||
log := oclib.GetLogger()
|
||||
c := &DTNCache{}
|
||||
key, err := deriveDNTKey()
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
|
||||
} else {
|
||||
c.aesKey = key
|
||||
c.loadFromDisk()
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// extractResourceID decodes payload as JSON and returns the string stored
// under its top-level "id" key. It returns "" when the payload cannot be
// decoded into that shape or when the key is absent.
func extractResourceID(payload []byte) string {
	var envelope struct {
		ID string `json:"id"`
	}
	if json.Unmarshal(payload, &envelope) != nil {
		return ""
	}
	return envelope.ID
}
|
||||
|
||||
// enqueue adds an entry to the cache, respecting the resource lifecycle.
|
||||
// Deduplication key is (did, resourceID): same resource to the same peer keeps
|
||||
// only the latest mutation. resourceID is empty for non-resource payloads
|
||||
// (planner, config), in which case deduplication falls back to did alone.
|
||||
//
|
||||
// - DELETE is terminal: any subsequent mutation on the same key is discarded.
|
||||
// - UPDATE cannot be followed by CREATE: the resource already exists remotely.
|
||||
// - All other cases replace the existing entry (newer mutation supersedes).
|
||||
func (c *DTNCache) enqueue(e *DTNEntry) {
|
||||
c.mu.Lock()
|
||||
found, mutated := false, false
|
||||
for i, existing := range c.entries {
|
||||
if existing.did != e.did || existing.resourceID != e.resourceID {
|
||||
continue
|
||||
}
|
||||
found = true
|
||||
if existing.proto == ProtocolDeleteResource ||
|
||||
(existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) {
|
||||
break // discard new entry silently — existing state is authoritative
|
||||
}
|
||||
c.entries[i] = e
|
||||
mutated = true
|
||||
break
|
||||
}
|
||||
if !found {
|
||||
c.entries = append(c.entries, e)
|
||||
mutated = true
|
||||
}
|
||||
c.mu.Unlock()
|
||||
if mutated && e.isEffectivelyCritical() {
|
||||
go c.persistToDisk()
|
||||
}
|
||||
}
|
||||
|
||||
// peersWithPending returns the distinct peer IDs (did) that have at least one
|
||||
// critical entry in the cache. Used to populate Heartbeat.PendingContact.
|
||||
func (c *DTNCache) peersWithPending() []string {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
seen := map[string]struct{}{}
|
||||
var out []string
|
||||
for _, e := range c.entries {
|
||||
if e.isEffectivelyCritical() {
|
||||
if _, ok := seen[e.did]; !ok {
|
||||
seen[e.did] = struct{}{}
|
||||
out = append(out, e.did)
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// drain atomically removes and returns all current entries.
|
||||
func (c *DTNCache) drain() []*DTNEntry {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
out := c.entries
|
||||
c.entries = nil
|
||||
return out
|
||||
}
|
||||
|
||||
// requeue puts entries back at the head of the list, preserving any new
|
||||
// entries added while the retry loop was running.
|
||||
func (c *DTNCache) requeue(entries []*DTNEntry) {
|
||||
if len(entries) == 0 {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
c.entries = append(entries, c.entries...)
|
||||
}
|
||||
|
||||
// ── Persistence ──────────────────────────────────────────────────────────────
|
||||
|
||||
// DTNCachePath returns the path of the on-disk cache file, placed next to the
|
||||
// node's private key so it lives on the same persistent volume.
|
||||
func DTNCachePath() string {
|
||||
return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
|
||||
}
|
||||
|
||||
// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
|
||||
// using HKDF-SHA256. The derivation is deterministic: the same key is always
|
||||
// produced from the same private key, so no symmetric secret needs storing.
|
||||
func deriveDNTKey() ([]byte, error) {
|
||||
priv, err := tools.LoadKeyFromFilePrivate()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
|
||||
// (32-byte seed || 32-byte public key). We use the full 64 bytes as IKM.
|
||||
raw, err := priv.Raw()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
|
||||
key := make([]byte, 32)
|
||||
if _, err := io.ReadFull(reader, key); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return key, nil
|
||||
}
|
||||
|
||||
// persistToDisk encrypts all current critical entries and writes them to disk.
|
||||
// Non-critical entries are deliberately excluded — they are not worth restoring
|
||||
// after a restart given their limited retry budget.
|
||||
func (c *DTNCache) persistToDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
c.mu.Lock()
|
||||
var toSave []DTNEntryJSON
|
||||
for _, e := range c.entries {
|
||||
if e.isEffectivelyCritical() {
|
||||
toSave = append(toSave, e.toJSON())
|
||||
}
|
||||
}
|
||||
c.mu.Unlock()
|
||||
|
||||
plaintext, err := json.Marshal(toSave)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
nonce := make([]byte, gcm.NonceSize())
|
||||
if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
|
||||
return
|
||||
}
|
||||
ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
|
||||
|
||||
path := DTNCachePath()
|
||||
tmp := path + ".tmp"
|
||||
if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to write cache file")
|
||||
return
|
||||
}
|
||||
if err := os.Rename(tmp, path); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
|
||||
_ = os.Remove(tmp)
|
||||
}
|
||||
}
|
||||
|
||||
// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
|
||||
// Errors (missing file, decryption failure) are non-fatal: the cache simply
|
||||
// starts empty, which is safe.
|
||||
func (c *DTNCache) loadFromDisk() {
|
||||
if c.aesKey == nil {
|
||||
return
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
path := DTNCachePath()
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
if !os.IsNotExist(err) {
|
||||
log.Warn().Err(err).Msg("[dnt] failed to read cache file")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
block, err := aes.NewCipher(c.aesKey)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
gcm, err := cipher.NewGCM(block)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
if len(data) < gcm.NonceSize() {
|
||||
log.Warn().Msg("[dnt] cache file too short, ignoring")
|
||||
return
|
||||
}
|
||||
nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
|
||||
plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
|
||||
if err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
var saved []DTNEntryJSON
|
||||
if err := json.Unmarshal(plaintext, &saved); err != nil {
|
||||
log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
|
||||
return
|
||||
}
|
||||
|
||||
count := 0
|
||||
for _, j := range saved {
|
||||
// Only restore critical entries — moderate entries are intentionally
|
||||
// not persisted, but this guard defends against format changes.
|
||||
e := entryFromJSON(j)
|
||||
if !e.isEffectivelyCritical() {
|
||||
continue
|
||||
}
|
||||
c.entries = append(c.entries, e)
|
||||
count++
|
||||
}
|
||||
if count > 0 {
|
||||
log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
|
||||
}
|
||||
}
|
||||
|
||||
// ── Retry loop ────────────────────────────────────────────────────────────────
|
||||
|
||||
// startDNTLoop runs the background retry goroutine. Call once after init.
|
||||
func (s *StreamService) startDNTLoop() {
|
||||
logger := oclib.GetLogger()
|
||||
ticker := time.NewTicker(DTNRetryInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
// retryEntries attempts delivery for the given entries and returns those
|
||||
// that must be kept for the next round.
|
||||
retryEntries := func(entries []*DTNEntry) []*DTNEntry {
|
||||
var keep []*DTNEntry
|
||||
for _, e := range entries {
|
||||
_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
|
||||
if err == nil {
|
||||
if e.isEffectivelyCritical() {
|
||||
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Msg("[dnt] critical message delivered after retry")
|
||||
} else {
|
||||
logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Int("retries", e.retries).Msg("[dnt] moderate message delivered after retry")
|
||||
}
|
||||
continue
|
||||
}
|
||||
if e.isEffectivelyCritical() {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
e.retries++
|
||||
if e.retries < DTNMaxModerateRetries {
|
||||
keep = append(keep, e)
|
||||
} else {
|
||||
logger.Warn().Str("proto", string(e.proto)).Str("peer", e.did).
|
||||
Int("retries", e.retries).Msg("[dnt] moderate message abandoned after max retries")
|
||||
}
|
||||
}
|
||||
}
|
||||
return keep
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
entries := s.dnt.drain()
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
s.dnt.requeue(retryEntries(entries))
|
||||
go s.dnt.persistToDisk()
|
||||
|
||||
case peerID := <-s.dntNudge:
|
||||
// A peer just signalled it is reachable — retry its entries immediately.
|
||||
entries := s.dnt.drain()
|
||||
var forPeer, other []*DTNEntry
|
||||
for _, e := range entries {
|
||||
if e.did == peerID {
|
||||
forPeer = append(forPeer, e)
|
||||
} else {
|
||||
other = append(other, e)
|
||||
}
|
||||
}
|
||||
kept := retryEntries(forPeer)
|
||||
s.dnt.requeue(append(kept, other...))
|
||||
if len(kept) < len(forPeer) {
|
||||
go s.dnt.persistToDisk()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -15,6 +15,7 @@ import (
|
||||
"cloud.o-forge.io/core/oc-lib/models/resources"
|
||||
"cloud.o-forge.io/core/oc-lib/tools"
|
||||
"github.com/libp2p/go-libp2p/core/network"
|
||||
pp "github.com/libp2p/go-libp2p/core/peer"
|
||||
)
|
||||
|
||||
type Verify struct {
|
||||
@@ -23,8 +24,18 @@ type Verify struct {
|
||||
|
||||
func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
|
||||
fmt.Println("handleEvent", protocol)
|
||||
// Heartbeat received on an outgoing ProtocolObserve stream.
|
||||
if protocol == ProtocolObserve {
|
||||
// Distinguish between an open request and a close request by inspecting
|
||||
// the ObserveRequest payload. The remote wraps both in a common.Event
|
||||
// with Type=ProtocolObserve so the persistent readLoop can decode them.
|
||||
var req ObserveRequest
|
||||
if evt.Payload != nil {
|
||||
json.Unmarshal(evt.Payload, &req) //nolint:errcheck — zero value means open
|
||||
}
|
||||
if req.Close {
|
||||
ps.observeCache.cancel(s.Conn().RemotePeer().String())
|
||||
return nil
|
||||
}
|
||||
return ps.handleIncomingObserve(s)
|
||||
}
|
||||
if protocol == observeHBEventType {
|
||||
@@ -59,6 +70,11 @@ func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s netwo
|
||||
return err
|
||||
}
|
||||
}
|
||||
if protocol == ProtocolSourcePresignResource {
|
||||
if err := ps.pass(evt, tools.SOURCE_PRESIGN_EVENT); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if protocol == ProtocolAdmiraltyConfigResource {
|
||||
if err := ps.pass(evt, tools.ADMIRALTY_CONFIG_EVENT); err != nil {
|
||||
return err
|
||||
@@ -125,9 +141,9 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
|
||||
}
|
||||
|
||||
func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
if !abs.ResourceSearches.IsActive(event.User) {
|
||||
/*if !abs.ResourceSearches.IsActive(event.User) {
|
||||
return nil // search already closed or timed out
|
||||
}
|
||||
}*/
|
||||
res, err := resources.ToResource(int(event.DataType), event.Payload)
|
||||
if err != nil || res == nil {
|
||||
return nil
|
||||
@@ -137,6 +153,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
b, err := json.Marshal(res.Serialize(res))
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: event.User,
|
||||
Datatype: tools.DataType(event.DataType),
|
||||
Method: int(tools.SEARCH_EVENT),
|
||||
Payload: b,
|
||||
@@ -147,6 +164,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
|
||||
func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) error { //
|
||||
go tools.NewNATSCaller().SetNATSPub(method, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: event.User,
|
||||
Datatype: tools.DataType(event.DataType),
|
||||
Method: int(method),
|
||||
Payload: event.Payload,
|
||||
@@ -154,6 +172,36 @@ func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) err
|
||||
return nil
|
||||
}
|
||||
|
||||
// resolveBookingNano does a single DB lookup and returns:
|
||||
//
|
||||
// (nil, true) — not a booking, dest_peer_id absent, or dest == self → process normally, no forward
|
||||
// (nano, true) — dest is one of our NANO peers → process + forward to nano
|
||||
// (nil, false) — dest is unknown → ignore
|
||||
func (ps *StreamService) resolveBookingNano(evt *common.Event) (*peer.Peer, bool) {
|
||||
if tools.DataType(evt.DataType) != tools.BOOKING {
|
||||
return nil, true
|
||||
}
|
||||
var b struct {
|
||||
DestPeerID string `json:"dest_peer_id"`
|
||||
}
|
||||
if err := json.Unmarshal(evt.Payload, &b); err != nil || b.DestPeerID == "" {
|
||||
return nil, true
|
||||
}
|
||||
if self, err := oclib.GetMySelf(); err == nil && self != nil && b.DestPeerID == self.GetID() {
|
||||
return nil, true
|
||||
}
|
||||
d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"id": {{Operator: dbs.EQUAL.String(), Value: b.DestPeerID}},
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
if len(d.Data) == 0 {
|
||||
return nil, false
|
||||
}
|
||||
return d.Data[0].(*peer.Peer), true
|
||||
}
|
||||
|
||||
func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol string) error {
|
||||
switch protocol {
|
||||
case ProtocolSearchResource:
|
||||
@@ -176,9 +224,10 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
|
||||
ps.SendResponse(p[0], evt, fmt.Sprintf("%v", search))
|
||||
}
|
||||
} else {
|
||||
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m)
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
fmt.Println("SEND SEARCH_EVENT SetNATSPub", m, evt.DataType, evt.User)
|
||||
tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.SEARCH_EVENT),
|
||||
Payload: evt.Payload,
|
||||
@@ -186,19 +235,35 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
|
||||
}
|
||||
case ProtocolCreateResource, ProtocolUpdateResource:
|
||||
fmt.Println("RECEIVED Protocol.Update", string(evt.Payload))
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
|
||||
nano, ok := ps.resolveBookingNano(evt)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.CREATE_RESOURCE),
|
||||
Payload: evt.Payload,
|
||||
})
|
||||
if nano != nil {
|
||||
ps.forwardToNano(nano, evt, protocol)
|
||||
}
|
||||
case ProtocolDeleteResource:
|
||||
go tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
nano, ok := ps.resolveBookingNano(evt)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
User: evt.User,
|
||||
Datatype: tools.DataType(evt.DataType),
|
||||
Method: int(tools.REMOVE_RESOURCE),
|
||||
Payload: evt.Payload,
|
||||
})
|
||||
if nano != nil {
|
||||
ps.forwardToNano(nano, evt, protocol)
|
||||
}
|
||||
default:
|
||||
return errors.New("no action authorized available : " + protocol)
|
||||
}
|
||||
@@ -223,11 +288,31 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
|
||||
searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
|
||||
for _, ss := range searched.Data {
|
||||
// SendResponse uses an admin request so SetAllowedInstances
|
||||
// never calls FilterExploitationAuthorizations. Apply it
|
||||
// explicitly here so we never leak private AEs to a remote peer.
|
||||
if r, ok := ss.(resources.ResourceInterface); ok {
|
||||
r.SetAllowedInstances(&tools.APIRequest{PeerID: p.UUID, Groups: event.Groups, Username: event.User})
|
||||
}
|
||||
if j, err := json.Marshal(ss); err == nil {
|
||||
abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// Close the ProtocolSearchResource stream to the requester immediately after
|
||||
// sending all results. This prevents TempStream from reusing a stale (already
|
||||
// closed by the remote) stream entry for a subsequent search from the same peer,
|
||||
// which would cause write failure and no results for the second search.
|
||||
if decodedID, err := pp.Decode(p.PeerID); err == nil {
|
||||
abs.Mu.Lock()
|
||||
if abs.Streams[ProtocolSearchResource] != nil {
|
||||
if s, ok := abs.Streams[ProtocolSearchResource][decodedID]; ok {
|
||||
s.Stream.Reset()
|
||||
delete(abs.Streams[ProtocolSearchResource], decodedID)
|
||||
}
|
||||
}
|
||||
abs.Mu.Unlock()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
+181
-61
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
|
||||
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
|
||||
const observeHBEventType = "/opencloud/peer/observe/heartbeat"
|
||||
|
||||
const observeHBInterval = 30 * time.Second
|
||||
const observeHBInterval = 10 * time.Second
|
||||
const observeDrainDuration = 30 * time.Second
|
||||
|
||||
// observeBatchWindow is the accumulation window before a heartbeat batch is
|
||||
@@ -45,7 +45,95 @@ type ObserveRequest struct {
|
||||
|
||||
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
|
||||
type ObserveHeartbeat struct {
|
||||
State string `json:"state"` // always "online" when actively emitted
|
||||
State string `json:"state"` // always "online" when actively emitted
|
||||
SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
|
||||
}
|
||||
|
||||
const (
|
||||
maxLatencyMs = 2000.0 // ms above which latency score → 0
|
||||
latencySamples = 5 // sliding window size for latency averaging
|
||||
fastThresholdMs = 200.0 // below = "fast", above = "slow"
|
||||
reliableThreshold = 0.95 // miss_rate below 5% = "reliable"
|
||||
)
|
||||
|
||||
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
|
||||
// Updated on every incoming heartbeat (observing side).
|
||||
type PeerObserveMetrics struct {
|
||||
mu sync.Mutex
|
||||
firstObservedAt time.Time
|
||||
lastHeartbeatAt time.Time
|
||||
received uint64
|
||||
latencies [latencySamples]time.Duration
|
||||
latIdx int
|
||||
latCount int
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) record(latency time.Duration) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.received++
|
||||
m.lastHeartbeatAt = time.Now().UTC()
|
||||
m.latencies[m.latIdx%latencySamples] = latency
|
||||
m.latIdx++
|
||||
if m.latCount < latencySamples {
|
||||
m.latCount++
|
||||
}
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
var total time.Duration
|
||||
for i := 0; i < m.latCount; i++ {
|
||||
total += m.latencies[i]
|
||||
}
|
||||
var avgMs float64
|
||||
if m.latCount > 0 {
|
||||
avgMs = float64(total.Milliseconds()) / float64(m.latCount)
|
||||
}
|
||||
expected := int64(time.Duration(m.lastHeartbeatAt.Second()-m.firstObservedAt.Second()) / observeHBInterval)
|
||||
fmt.Println("EXPECTED", expected, m.received)
|
||||
var missRate float64
|
||||
if expected > 0 {
|
||||
recv := int64(m.received)
|
||||
if recv > expected {
|
||||
recv = expected
|
||||
}
|
||||
missRate = 1.0 - float64(recv)/float64(expected)
|
||||
}
|
||||
latScore := 1.0 - avgMs/maxLatencyMs
|
||||
if latScore < 0 {
|
||||
latScore = 0
|
||||
}
|
||||
relScore := 1.0 - missRate
|
||||
trust := (0.35*latScore + 0.65*relScore) * 100
|
||||
|
||||
speed := "fast"
|
||||
if avgMs >= fastThresholdMs {
|
||||
speed = "slow"
|
||||
}
|
||||
reliability := "reliable"
|
||||
if relScore < reliableThreshold {
|
||||
reliability = "watch"
|
||||
}
|
||||
return PeerObserveSnapshot{
|
||||
LatencyMs: avgMs,
|
||||
Speed: speed,
|
||||
Reliability: reliability,
|
||||
TrustScore: trust,
|
||||
LastSeenAt: m.lastHeartbeatAt,
|
||||
MissRate: missRate,
|
||||
}
|
||||
}
|
||||
|
||||
// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
|
||||
type PeerObserveSnapshot struct {
|
||||
LatencyMs float64 `json:"latency_ms"`
|
||||
Speed string `json:"speed"` // "fast" | "slow"
|
||||
Reliability string `json:"reliability"` // "reliable" | "watch"
|
||||
TrustScore float64 `json:"trust_score"`
|
||||
LastSeenAt time.Time `json:"last_seen_at"`
|
||||
MissRate float64 `json:"miss_rate"`
|
||||
}
|
||||
|
||||
// ShallowPeer is the minimal peer representation sent by oc-peer in a
|
||||
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
|
||||
|
||||
// ── incoming observe handler (observed side) ──────────────────────────────────
|
||||
|
||||
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
|
||||
// It is called when a remote peer opens an observe stream to us.
|
||||
// The function reads the request, validates it, then starts (or stops) the
|
||||
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
|
||||
// handleIncomingObserve is called when a remote peer opens an observe stream
|
||||
// to us (observed side). It starts a heartbeat goroutine that writes back on
|
||||
// the same bidirectional rawStream — no separate reverse stream is opened.
|
||||
// The goroutine stops via context cancellation (triggered by a close event
|
||||
// read from rawStream) or when rawStream becomes unwritable.
|
||||
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
remotePeerID := rawStream.Conn().RemotePeer().String()
|
||||
addr := rawStream.Conn().RemoteMultiaddr().String()
|
||||
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
|
||||
if err != nil {
|
||||
fmt.Println("qndlqnl EERR", addr, err)
|
||||
return err
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
|
||||
// Drain mode: reject any new observations for 30 s after a close-all.
|
||||
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
|
||||
s.drainMu.RUnlock()
|
||||
if draining {
|
||||
rawStream.Close()
|
||||
fmt.Println("Draining")
|
||||
return errors.New("Draining")
|
||||
return errors.New("draining")
|
||||
}
|
||||
// Read the observe request (with a generous deadline to avoid hangs).
|
||||
// Guard: the requesting peer must not be blacklisted or be ourself.
|
||||
did := ""
|
||||
|
||||
// Guard: the requesting peer must not be blacklisted.
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
res := access.Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
}, "", false, 0, 1)
|
||||
if len(res.Data) > 0 {
|
||||
p := res.Data[0].(*peer.Peer)
|
||||
did = p.GetID()
|
||||
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
|
||||
rawStream.Close()
|
||||
if p.Relation == peer.BLACKLIST {
|
||||
fmt.Println("CLOSE blacklist or self")
|
||||
return errors.New("can't exploit blacklist or self")
|
||||
return errors.New("can't observe blacklisted peer")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
s.observeCache.set(remotePeerID, cancel)
|
||||
fmt.Println("LOOP OBSERVE")
|
||||
go func() {
|
||||
defer rawStream.Close()
|
||||
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
|
||||
// owns rawStream's lifecycle. We only stop writing.
|
||||
defer cancel()
|
||||
defer s.observeCache.delete(remotePeerID)
|
||||
|
||||
ticker := time.NewTicker(observeHBInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
|
||||
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
|
||||
if evt == nil {
|
||||
return
|
||||
}
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
buildHBEvent := func() *common.Event {
|
||||
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
|
||||
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
|
||||
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
|
||||
fmt.Println("LOOP EVT", evt)
|
||||
var err error
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
evt := buildHBEvent()
|
||||
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
|
||||
return
|
||||
}
|
||||
rawStream.SetWriteDeadline(time.Time{})
|
||||
}
|
||||
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
// ── heartbeat receiver (observing side) ───────────────────────────────────────
|
||||
|
||||
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
|
||||
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
|
||||
// accumulator; the batcher flushes to NATS after observeBatchWindow.
|
||||
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
|
||||
// a quality snapshot to NATS.
|
||||
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
|
||||
// ps.hbBatcher.add(evt.From)
|
||||
flushObserveBatch([]string{evt.From})
|
||||
var hb ObserveHeartbeat
|
||||
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
|
||||
latency := time.Since(hb.SentAt)
|
||||
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
|
||||
firstObservedAt: time.Now().UTC(),
|
||||
})
|
||||
raw.(*PeerObserveMetrics).record(latency)
|
||||
fmt.Println("METRICS", raw)
|
||||
ps.observeMetrics.Store(evt.From, raw)
|
||||
}
|
||||
ps.flushObserveForPeer(evt.From, evt.User)
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
|
||||
// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
|
||||
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
|
||||
var snap *PeerObserveSnapshot
|
||||
if raw, ok := ps.observeMetrics.Load(peerID); ok {
|
||||
fmt.Println("RETRIEVED METRICS", raw)
|
||||
s := raw.(*PeerObserveMetrics).snapshot()
|
||||
snap = &s
|
||||
}
|
||||
fmt.Println("RETRIEVED METRICS 2", snap)
|
||||
payload, err := json.Marshal(map[string]interface{}{
|
||||
"peer_ids": []string{peerID},
|
||||
"state": "online",
|
||||
"metrics": map[string]*PeerObserveSnapshot{peerID: snap},
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
|
||||
Payload: payload,
|
||||
})
|
||||
propPayload, err := json.Marshal(tools.PropalgationMessage{
|
||||
DataType: int(tools.PEER),
|
||||
Action: tools.PB_PROPAGATE,
|
||||
Payload: payload,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: propPayload,
|
||||
})
|
||||
}
|
||||
|
||||
// ── user→peer index (ref-counted observe management) ─────────────────────────
|
||||
|
||||
// userPeerIndex tracks which users are observing which peers.
|
||||
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
|
||||
}
|
||||
|
||||
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
|
||||
// the remote side.
|
||||
// the remote side. The close event is wrapped in a common.Event so the remote's
|
||||
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
|
||||
func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
decodedID, err := pp.Decode(toPeerID)
|
||||
if err != nil {
|
||||
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
ps.Mu.Lock()
|
||||
if ps.Streams[ProtocolObserve] != nil {
|
||||
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
delete(ps.Streams[ProtocolObserve], decodedID)
|
||||
}
|
||||
}
|
||||
ps.Mu.Unlock()
|
||||
ps.observeMetrics.Delete(toPeerID)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
func (ps *StreamService) CloseAllObserves() {
|
||||
ps.Mu.Lock()
|
||||
for _, s := range ps.Streams[ProtocolObserve] {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
}
|
||||
delete(ps.Streams, ProtocolObserve)
|
||||
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
|
||||
|
||||
// Reset user index so stale ref-counts don't block future opens.
|
||||
ps.observeUsers = newUserPeerIndex()
|
||||
ps.observeMetrics.Range(func(k, _ any) bool {
|
||||
ps.observeMetrics.Delete(k)
|
||||
return true
|
||||
})
|
||||
|
||||
ps.drainMu.Lock()
|
||||
ps.drainUntil = time.Now().Add(observeDrainDuration)
|
||||
|
||||
@@ -61,15 +61,17 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
|
||||
}
|
||||
stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
|
||||
if err != nil {
|
||||
if _, ok := dntProtocols[proto]; ok {
|
||||
ps.dnt.enqueue(&dntEntry{
|
||||
did: toPeerID,
|
||||
addr: *ad,
|
||||
dt: dt,
|
||||
user: user,
|
||||
payload: resource,
|
||||
proto: proto,
|
||||
addedAt: time.Now().UTC(),
|
||||
if _, ok := DTNProtocols[proto]; ok {
|
||||
ps.dnt.enqueue(&DTNEntry{
|
||||
did: toPeerID,
|
||||
resourceID: extractResourceID(resource),
|
||||
forceCritical: pe.Relation == peer.NANO,
|
||||
addr: *ad,
|
||||
dt: dt,
|
||||
user: user,
|
||||
payload: resource,
|
||||
proto: proto,
|
||||
addedAt: time.Now().UTC(),
|
||||
})
|
||||
}
|
||||
return nil, err
|
||||
@@ -125,20 +127,45 @@ func (ps *StreamService) ToPartnerPublishEvent(
|
||||
|
||||
return nil
|
||||
}
|
||||
ks := []protocol.ID{}
|
||||
for k := range protocolsPartners {
|
||||
ks = append(ks, k)
|
||||
// Extract creator_id to route to the correct nano.
|
||||
// A master must only forward a resource to the nano that owns it.
|
||||
var creatorID string
|
||||
var minPayload struct {
|
||||
CreatorID string `json:"creator_id"`
|
||||
}
|
||||
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
|
||||
if json.Unmarshal(payload, &minPayload) == nil {
|
||||
creatorID = minPayload.CreatorID
|
||||
}
|
||||
|
||||
// PARTNER and MASTER receive every resource unconditionally.
|
||||
for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER} {
|
||||
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
|
||||
},
|
||||
}, payload, proto)
|
||||
}
|
||||
|
||||
// NANO: only send to the nano whose UUID matches the resource creator.
|
||||
if creatorID != "" {
|
||||
ps.PublishesCommon(dt, user, groups, &dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
|
||||
"id": {{Operator: dbs.EQUAL.String(), Value: creatorID}},
|
||||
},
|
||||
}, payload, proto)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// forwardToNano sends a booking mutation directly to a known NANO peer.
|
||||
// The NANO peer is already resolved by the caller (resolveBookingNano).
|
||||
// DTN critical is applied automatically by PublishCommon (Relation == NANO).
|
||||
func (abs *StreamService) forwardToNano(nano *peer.Peer, evt *common.Event, proto string) {
|
||||
dt := tools.DataType(evt.DataType)
|
||||
abs.PublishCommon(&dt, evt.User, evt.Groups, nano.PeerID, protocol.ID(proto), evt.Payload)
|
||||
}
|
||||
|
||||
func (s *StreamService) write(
|
||||
did string,
|
||||
peerID *pp.AddrInfo,
|
||||
|
||||
@@ -27,6 +27,10 @@ const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
|
||||
const ProtocolMinioConfigResource = "/opencloud/minio/config/1.0"
|
||||
const ProtocolAdmiraltyConfigResource = "/opencloud/admiralty/config/1.0"
|
||||
|
||||
// ProtocolSourcePresignResource routes PB_SOURCE_PRESIGN to the resource-owner peer.
|
||||
// The owner generates a pre-signed Minio URL and responds via PB_CONSIDERS.
|
||||
const ProtocolSourcePresignResource = "/opencloud/resource/source-presign/1.0"
|
||||
|
||||
const ProtocolSearchResource = "/opencloud/resource/search/1.0"
|
||||
const ProtocolCreateResource = "/opencloud/resource/create/1.0"
|
||||
const ProtocolUpdateResource = "/opencloud/resource/update/1.0"
|
||||
@@ -43,6 +47,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
|
||||
ProtocolVerifyResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolMinioConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolSourcePresignResource: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
ProtocolObserve: {WaitResponse: true, TTL: 1 * time.Minute},
|
||||
}
|
||||
|
||||
@@ -63,8 +68,8 @@ type StreamService struct {
|
||||
// IsPeerKnown, when set, is called at stream open for every inbound protocol.
|
||||
// Return false to reset the stream immediately. Left nil until wired by the node.
|
||||
IsPeerKnown func(pid pp.ID) bool
|
||||
// dnt is the Disconnection Network Tolerance cache for outbound streams.
|
||||
dnt *dntCache
|
||||
// DTN is the Disconnection Network Tolerance cache for outbound streams.
|
||||
dnt *DTNCache
|
||||
// observeCache tracks running heartbeat goroutines on the OBSERVED side.
|
||||
observeCache *observeCache
|
||||
// hbBatcher accumulates incoming heartbeats (observing side) and flushes
|
||||
@@ -78,6 +83,12 @@ type StreamService struct {
|
||||
// observeUsers tracks which users are observing which peers so streams are
|
||||
// closed only when the last observer for a peer disconnects.
|
||||
observeUsers *userPeerIndex
|
||||
// observeMetrics accumulates connection-quality data per observed peer (observing side).
|
||||
// Keys are peer_id strings; values are *PeerObserveMetrics.
|
||||
observeMetrics sync.Map
|
||||
// dntNudge receives peer IDs for which an immediate DTN retry should be
|
||||
// attempted (e.g. when the peer just reconnected via PendingCallers).
|
||||
dntNudge chan string
|
||||
}
|
||||
|
||||
func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
|
||||
@@ -92,6 +103,7 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
|
||||
dnt: newDNTCache(),
|
||||
observeCache: newObserveCache(),
|
||||
observeUsers: newUserPeerIndex(),
|
||||
dntNudge: make(chan string, 32),
|
||||
}
|
||||
service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
|
||||
for proto := range protocols {
|
||||
@@ -105,6 +117,23 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
|
||||
return service, nil
|
||||
}
|
||||
|
||||
// PendingContacts returns the peer IDs that have at least one critical DTN
|
||||
// entry pending. Called on each heartbeat tick to populate PendingContact.
|
||||
func (s *StreamService) PendingContacts() []string {
|
||||
return s.dnt.peersWithPending()
|
||||
}
|
||||
|
||||
// NudgeContacts signals the DTN loop to retry immediately for the given peer
|
||||
// IDs (typically received via HeartbeatResponse.PendingCallers).
|
||||
func (s *StreamService) NudgeContacts(peerIDs []string) {
|
||||
for _, id := range peerIDs {
|
||||
select {
|
||||
case s.dntNudge <- id:
|
||||
default:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gate wraps a stream handler with IsPeerKnown validation.
|
||||
// If the peer is unknown the entire connection is closed and the handler is not called.
|
||||
// IsPeerKnown is read at stream-open time so it works even when set after InitStream.
|
||||
@@ -117,6 +146,17 @@ func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Strea
|
||||
},
|
||||
}, "", false, 0, 1)
|
||||
if len(d.Data) == 0 {
|
||||
stream.Reset()
|
||||
return
|
||||
}
|
||||
master := d.Data[0].(*peer.Peer)
|
||||
if stream.Conn().RemotePeer().String() != master.PeerID {
|
||||
logger := oclib.GetLogger()
|
||||
logger.Warn().
|
||||
Str("remote", stream.Conn().RemotePeer().String()).
|
||||
Str("master", master.PeerID).
|
||||
Msg("[gate] nano rejected stream from non-master peer")
|
||||
stream.Reset()
|
||||
return
|
||||
}
|
||||
}
|
||||
@@ -162,9 +202,17 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
|
||||
Stream: stream,
|
||||
Expiry: time.Now().UTC().Add(expiry + 1*time.Minute),
|
||||
}
|
||||
|
||||
// ProtocolObserve uses a bidirectional long-lived stream: the remote writes
|
||||
// heartbeats back on the same stream, and may later send a close event.
|
||||
// Use a persistent readLoop so we can receive both heartbeats and close events.
|
||||
protoInfo := protocols[stream.Protocol()]
|
||||
if stream.Protocol() == ProtocolObserve {
|
||||
protoInfo = &common.ProtocolInfo{PersistantStream: true}
|
||||
}
|
||||
go s.readLoop(s.Streams[stream.Protocol()][stream.Conn().RemotePeer()],
|
||||
stream.Conn().RemotePeer(),
|
||||
stream.Protocol(), protocols[stream.Protocol()])
|
||||
stream.Protocol(), protoInfo)
|
||||
}
|
||||
|
||||
func (s *StreamService) connectToPartners() error {
|
||||
|
||||
Reference in New Issue
Block a user