Discovery Neo Oclib

2026-05-27 16:17:00 +02:00
parent 7f951afd41
commit 6ce6e6fe7d
20 changed files with 1436 additions and 1133 deletions
@@ -1,362 +0,0 @@
-package stream
-
-// dnt_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
-//
-// When a stream write fails because the remote peer is unreachable, the request
-// is saved here and retried on the next tick.  Two levels are defined:
-//
-//   - dntCritical : retry indefinitely (create / update / delete resource).
-//   - dntModerate : up to dntMaxModerateRetries retries, then abandon.
-//
-// Pubsub messages and search streams are explicitly excluded.
-// Streams initiated from the indexer side are never enqueued here.
-//
-// # Crash-resilient persistence
-//
-// Critical entries are written to an encrypted file (AES-256-GCM) so they
-// survive a node crash/restart.  The AES key is derived deterministically from
-// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
-// Moderate entries are intentionally not persisted: their retry budget is small
-// enough that re-loading them after a restart would be misleading.
-
-import (
-	"crypto/aes"
-	"crypto/cipher"
-	"crypto/rand"
-	"crypto/sha256"
-	"encoding/json"
-	"io"
-	"os"
-	"path/filepath"
-	"sync"
-	"time"
-
-	oclib "cloud.o-forge.io/core/oc-lib"
-	"cloud.o-forge.io/core/oc-lib/tools"
-	"golang.org/x/crypto/hkdf"
-
-	"oc-discovery/conf"
-
-	pp "github.com/libp2p/go-libp2p/core/peer"
-	"github.com/libp2p/go-libp2p/core/protocol"
-)
-
-type dntLevel int
-
-const (
-	dntCritical dntLevel = iota // retry until the message is delivered
-	dntModerate                 // retry up to dntMaxModerateRetries times
-)
-
-const dntMaxModerateRetries = 3
-const dntRetryInterval = 15 * time.Second
-
-// dntProtocols maps each stream protocol to its DNT level.
-// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
-var dntProtocols = map[protocol.ID]dntLevel{
-	// Critical — data mutations that must eventually be delivered.
-	ProtocolCreateResource: dntCritical,
-	ProtocolUpdateResource: dntCritical,
-	ProtocolDeleteResource: dntCritical,
-	// Moderate — confirmations / config / planner: 3 retries before abandon.
-	ProtocolVerifyResource:          dntModerate,
-	ProtocolSendPlanner:             dntModerate,
-	ProtocolConsidersResource:       dntModerate,
-	ProtocolMinioConfigResource:     dntModerate,
-	ProtocolAdmiraltyConfigResource: dntModerate,
-}
-
-// dntEntryJSON is the on-disk representation of a dntEntry.
-// pp.AddrInfo and protocol.ID don't have built-in JSON tags so we flatten them.
-type dntEntryJSON struct {
-	DID     string           `json:"did"`
-	Addr    pp.AddrInfo      `json:"addr"`
-	DT      *tools.DataType  `json:"dt,omitempty"`
-	User    string           `json:"user"`
-	Payload []byte           `json:"payload"`
-	Proto   protocol.ID      `json:"proto"`
-	Retries int              `json:"retries"`
-	AddedAt time.Time        `json:"added_at"`
-}
-
-type dntEntry struct {
-	did     string
-	addr    pp.AddrInfo
-	dt      *tools.DataType
-	user    string
-	payload []byte
-	proto   protocol.ID
-	retries int
-	addedAt time.Time
-}
-
-func (e *dntEntry) toJSON() dntEntryJSON {
-	return dntEntryJSON{
-		DID:     e.did,
-		Addr:    e.addr,
-		DT:      e.dt,
-		User:    e.user,
-		Payload: e.payload,
-		Proto:   e.proto,
-		Retries: e.retries,
-		AddedAt: e.addedAt,
-	}
-}
-
-func entryFromJSON(j dntEntryJSON) *dntEntry {
-	return &dntEntry{
-		did:     j.DID,
-		addr:    j.Addr,
-		dt:      j.DT,
-		user:    j.User,
-		payload: j.Payload,
-		proto:   j.Proto,
-		retries: j.Retries,
-		addedAt: j.AddedAt,
-	}
-}
-
-type dntCache struct {
-	mu      sync.Mutex
-	entries []*dntEntry
-	// aesKey is the derived AES-256 key used for on-disk encryption.
-	// Nil when key derivation failed: persistence is disabled but the in-memory
-	// cache continues to function normally.
-	aesKey []byte
-}
-
-// newDNTCache initialises the cache, derives the encryption key, and restores
-// any critical entries that were persisted before the last crash.
-func newDNTCache() *dntCache {
-	log := oclib.GetLogger()
-	c := &dntCache{}
-	key, err := deriveDNTKey()
-	if err != nil {
-		log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
-	} else {
-		c.aesKey = key
-		c.loadFromDisk()
-	}
-	return c
-}
-
-// enqueue adds an entry to the cache and persists critical entries to disk.
-func (c *dntCache) enqueue(e *dntEntry) {
-	c.mu.Lock()
-	c.entries = append(c.entries, e)
-	c.mu.Unlock()
-	if dntProtocols[e.proto] == dntCritical {
-		go c.persistToDisk()
-	}
-}
-
-// drain atomically removes and returns all current entries.
-func (c *dntCache) drain() []*dntEntry {
-	c.mu.Lock()
-	defer c.mu.Unlock()
-	out := c.entries
-	c.entries = nil
-	return out
-}
-
-// requeue puts entries back at the head of the list, preserving any new
-// entries added while the retry loop was running.
-func (c *dntCache) requeue(entries []*dntEntry) {
-	if len(entries) == 0 {
-		return
-	}
-	c.mu.Lock()
-	defer c.mu.Unlock()
-	c.entries = append(entries, c.entries...)
-}
-
-// ── Persistence ──────────────────────────────────────────────────────────────
-
-// dntCachePath returns the path of the on-disk cache file, placed next to the
-// node's private key so it lives on the same persistent volume.
-func dntCachePath() string {
-	return filepath.Join(filepath.Dir(conf.GetConfig().PrivateKeyPath), "dnt_cache.bin")
-}
-
-// deriveDNTKey derives a 32-byte AES key from the node's Ed25519 private key
-// using HKDF-SHA256.  The derivation is deterministic: the same key is always
-// produced from the same private key, so no symmetric secret needs storing.
-func deriveDNTKey() ([]byte, error) {
-	priv, err := tools.LoadKeyFromFilePrivate()
-	if err != nil {
-		return nil, err
-	}
-	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
-	// (32-byte seed || 32-byte public key).  We use the full 64 bytes as IKM.
-	raw, err := priv.Raw()
-	if err != nil {
-		return nil, err
-	}
-	reader := hkdf.New(sha256.New, raw, nil, []byte("oc-discovery/dnt-cache/v1"))
-	key := make([]byte, 32)
-	if _, err := io.ReadFull(reader, key); err != nil {
-		return nil, err
-	}
-	return key, nil
-}
-
-// persistToDisk encrypts all current critical entries and writes them to disk.
-// Non-critical entries are deliberately excluded — they are not worth restoring
-// after a restart given their limited retry budget.
-func (c *dntCache) persistToDisk() {
-	if c.aesKey == nil {
-		return
-	}
-	log := oclib.GetLogger()
-	c.mu.Lock()
-	var toSave []dntEntryJSON
-	for _, e := range c.entries {
-		if dntProtocols[e.proto] == dntCritical {
-			toSave = append(toSave, e.toJSON())
-		}
-	}
-	c.mu.Unlock()
-
-	plaintext, err := json.Marshal(toSave)
-	if err != nil {
-		return
-	}
-
-	block, err := aes.NewCipher(c.aesKey)
-	if err != nil {
-		return
-	}
-	gcm, err := cipher.NewGCM(block)
-	if err != nil {
-		return
-	}
-	nonce := make([]byte, gcm.NonceSize())
-	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
-		return
-	}
-	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
-
-	path := dntCachePath()
-	tmp := path + ".tmp"
-	if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
-		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
-		return
-	}
-	if err := os.Rename(tmp, path); err != nil {
-		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
-		_ = os.Remove(tmp)
-	}
-}
-
-// loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries.
-// Errors (missing file, decryption failure) are non-fatal: the cache simply
-// starts empty, which is safe.
-func (c *dntCache) loadFromDisk() {
-	if c.aesKey == nil {
-		return
-	}
-	log := oclib.GetLogger()
-	path := dntCachePath()
-	data, err := os.ReadFile(path)
-	if err != nil {
-		if !os.IsNotExist(err) {
-			log.Warn().Err(err).Msg("[dnt] failed to read cache file")
-		}
-		return
-	}
-
-	block, err := aes.NewCipher(c.aesKey)
-	if err != nil {
-		return
-	}
-	gcm, err := cipher.NewGCM(block)
-	if err != nil {
-		return
-	}
-	if len(data) < gcm.NonceSize() {
-		log.Warn().Msg("[dnt] cache file too short, ignoring")
-		return
-	}
-	nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():]
-	plaintext, err := gcm.Open(nil, nonce, ciphertext, nil)
-	if err != nil {
-		log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
-		return
-	}
-
-	var saved []dntEntryJSON
-	if err := json.Unmarshal(plaintext, &saved); err != nil {
-		log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
-		return
-	}
-
-	count := 0
-	for _, j := range saved {
-		// Only restore critical entries — moderate entries are intentionally
-		// not persisted, but this guard defends against format changes.
-		if dntProtocols[j.Proto] != dntCritical {
-			continue
-		}
-		c.entries = append(c.entries, entryFromJSON(j))
-		count++
-	}
-	if count > 0 {
-		log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk")
-	}
-}
-
-// ── Retry loop ────────────────────────────────────────────────────────────────
-
-// startDNTLoop runs the background retry goroutine.  Call once after init.
-func (s *StreamService) startDNTLoop() {
-	logger := oclib.GetLogger()
-	ticker := time.NewTicker(dntRetryInterval)
-	defer ticker.Stop()
-	for range ticker.C {
-		entries := s.dnt.drain()
-		if len(entries) == 0 {
-			continue
-		}
-		var keep []*dntEntry
-		for _, e := range entries {
-			_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
-			if err == nil {
-				level := dntProtocols[e.proto]
-				if level == dntCritical {
-					logger.Info().
-						Str("proto", string(e.proto)).
-						Str("peer", e.did).
-						Msg("[dnt] critical message delivered after retry")
-				} else {
-					logger.Info().
-						Str("proto", string(e.proto)).
-						Str("peer", e.did).
-						Int("retries", e.retries).
-						Msg("[dnt] moderate message delivered after retry")
-				}
-				continue
-			}
-			level := dntProtocols[e.proto]
-			switch level {
-			case dntCritical:
-				keep = append(keep, e)
-			case dntModerate:
-				e.retries++
-				if e.retries < dntMaxModerateRetries {
-					keep = append(keep, e)
-				} else {
-					logger.Warn().
-						Str("proto", string(e.proto)).
-						Str("peer", e.did).
-						Int("retries", e.retries).
-						Msg("[dnt] moderate message abandoned after max retries")
-				}
-			}
-		}
-		s.dnt.requeue(keep)
-		// Persist after each tick so the on-disk file reflects the current
-		// state (entries delivered are removed, new ones from concurrent
-		// enqueues are included).
-		go s.dnt.persistToDisk()
-	}
-}
@@ -0,0 +1,446 @@
+package stream
+
+// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
+//
+// When a stream write fails because the remote peer is unreachable, the request
+// is saved here and retried on the next tick.  Two levels are defined:
+//
+//   - DTNCritical : retry indefinitely (create / update / delete resource).
+//   - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
+//
+// Pubsub messages and search streams are explicitly excluded.
+// Streams initiated from the indexer side are never enqueued here.
+//
+// # Crash-resilient persistence
+//
+// Critical entries are written to an encrypted file (AES-256-GCM) so they
+// survive a node crash/restart.  The AES key is derived deterministically from
+// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
+// Moderate entries are intentionally not persisted: their retry budget is small
+// enough that re-loading them after a restart would be misleading.
+
+import (
+	"crypto/aes"
+	"crypto/cipher"
+	"crypto/rand"
+	"crypto/sha256"
+	"encoding/json"
+	"io"
+	"os"
+	"path/filepath"
+	"sync"
+	"time"
+
+	oclib "cloud.o-forge.io/core/oc-lib"
+	"cloud.o-forge.io/core/oc-lib/tools"
+	"golang.org/x/crypto/hkdf"
+
+	"oc-discovery/conf"
+
+	pp "github.com/libp2p/go-libp2p/core/peer"
+	"github.com/libp2p/go-libp2p/core/protocol"
+)
+
+// DTNLevel classifies how persistently a failed outbound request is retried.
+type DTNLevel int
+
+// NOTE: DTNCritical is the zero value of DTNLevel, so a plain index into
+// DTNProtocols with an unregistered protocol also yields DTNCritical; use the
+// comma-ok form when "absent" must be told apart from "critical".
+const (
+	DTNCritical DTNLevel = iota // retry until the message is delivered
+	DTNModerate                 // retry up to DTNMaxModerateRetries times
+)
+
+// DTNMaxModerateRetries is the number of failed retries after which a
+// moderate entry is abandoned.
+const DTNMaxModerateRetries = 3
+// DTNRetryInterval is the period of the background retry ticker.
+const DTNRetryInterval = 15 * time.Second
+
+// DTNProtocols maps each stream protocol to its DTN level.
+// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
+var DTNProtocols = map[protocol.ID]DTNLevel{
+	// Critical — data mutations that must eventually be delivered.
+	ProtocolCreateResource: DTNCritical,
+	ProtocolUpdateResource: DTNCritical,
+	ProtocolDeleteResource: DTNCritical,
+	// Moderate — confirmations / config / planner: 3 retries before abandon.
+	ProtocolVerifyResource:          DTNModerate,
+	ProtocolSendPlanner:             DTNModerate,
+	ProtocolConsidersResource:       DTNModerate,
+	ProtocolMinioConfigResource:     DTNModerate,
+	ProtocolAdmiraltyConfigResource: DTNModerate,
+	ProtocolSourcePresignResource:   DTNModerate,
+}
+
+// DTNEntryJSON is the on-disk representation of a DTNEntry.
+// It mirrors DTNEntry field for field, but with exported, JSON-tagged fields:
+// DTNEntry's own fields are unexported and would be ignored by encoding/json.
+type DTNEntryJSON struct {
+	DID           string          `json:"did"`
+	ResourceID    string          `json:"resource_id,omitempty"`
+	ForceCritical bool            `json:"force_critical,omitempty"`
+	Addr          pp.AddrInfo     `json:"addr"`
+	DT            *tools.DataType `json:"dt,omitempty"`
+	User          string          `json:"user"`
+	Payload       []byte          `json:"payload"`
+	Proto         protocol.ID     `json:"proto"`
+	Retries       int             `json:"retries"`
+	AddedAt       time.Time       `json:"added_at"`
+}
+
+// DTNEntry is one outbound stream request waiting to be retried.
+// The did/addr/dt/user/payload/proto fields are exactly the arguments the
+// retry loop hands back to StreamService.write.
+type DTNEntry struct {
+	did           string
+	resourceID    string // UUID of the resource; empty for non-resource payloads (planner, config)
+	forceCritical bool   // true when destination is NANO: all protocols become critical
+	addr          pp.AddrInfo
+	dt            *tools.DataType
+	user          string
+	payload       []byte
+	proto         protocol.ID
+	retries       int       // failed retry attempts so far; only incremented for non-critical entries
+	addedAt       time.Time // presumably the time the entry was first cached — set by the caller, TODO confirm
+}
+
+// isEffectivelyCritical returns true when the entry must be retried indefinitely,
+// either because its protocol is registered as critical in DTNProtocols or
+// because the destination is a NANO peer (forceCritical).
+//
+// The lookup uses the comma-ok form on purpose: DTNCritical is the zero value
+// of DTNLevel, so a plain index would report every protocol that is missing
+// from DTNProtocols as critical.
+func (e *DTNEntry) isEffectivelyCritical() bool {
+	if e.forceCritical {
+		return true
+	}
+	level, registered := DTNProtocols[e.proto]
+	return registered && level == DTNCritical
+}
+
+func (e *DTNEntry) toJSON() DTNEntryJSON {
+	return DTNEntryJSON{
+		DID:           e.did,
+		ResourceID:    e.resourceID,
+		ForceCritical: e.forceCritical,
+		Addr:          e.addr,
+		DT:            e.dt,
+		User:          e.user,
+		Payload:       e.payload,
+		Proto:         e.proto,
+		Retries:       e.retries,
+		AddedAt:       e.addedAt,
+	}
+}
+
+// entryFromJSON rebuilds an in-memory entry from its serialised form.
+// It is the exact inverse of (*DTNEntry).toJSON.
+func entryFromJSON(j DTNEntryJSON) *DTNEntry {
+	entry := new(DTNEntry)
+	entry.did = j.DID
+	entry.resourceID = j.ResourceID
+	entry.forceCritical = j.ForceCritical
+	entry.addr = j.Addr
+	entry.dt = j.DT
+	entry.user = j.User
+	entry.payload = j.Payload
+	entry.proto = j.Proto
+	entry.retries = j.Retries
+	entry.addedAt = j.AddedAt
+	return entry
+}
+
+// DTNCache holds the outbound requests awaiting retry.
+// mu guards entries; aesKey is written once by the constructor and only read
+// afterwards.
+type DTNCache struct {
+	mu      sync.Mutex
+	entries []*DTNEntry
+	// aesKey is the derived AES-256 key used for on-disk encryption.
+	// Nil when key derivation failed: persistence is disabled but the in-memory
+	// cache continues to function normally.
+	aesKey []byte
+}
+
+// newDNTCache builds an empty cache and tries to derive the on-disk encryption
+// key.  When derivation succeeds, critical entries persisted by a previous run
+// are reloaded; when it fails, a warning is logged and the cache works purely
+// in memory.
+func newDNTCache() *DTNCache {
+	logger := oclib.GetLogger()
+	cache := new(DTNCache)
+	key, err := deriveDNTKey()
+	if err != nil {
+		logger.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled")
+		return cache
+	}
+	cache.aesKey = key
+	cache.loadFromDisk()
+	return cache
+}
+
+// extractResourceID returns the "id" field from a JSON resource payload.
+// Returns "" when the payload is not a resource object (planner, config, etc.).
+func extractResourceID(payload []byte) string {
+	var obj struct {
+		ID string `json:"id"`
+	}
+	if err := json.Unmarshal(payload, &obj); err != nil {
+		return ""
+	}
+	return obj.ID
+}
+
+// enqueue adds an entry to the cache, respecting the resource lifecycle.
+// Deduplication key is (did, resourceID): same resource to the same peer keeps
+// only the latest mutation.  resourceID is empty for non-resource payloads
+// (planner, config); those carry no lifecycle, so they are deduplicated only
+// against a pending entry of the SAME protocol for the same peer — otherwise
+// e.g. a planner message would silently overwrite a pending config message.
+//
+//   - DELETE is terminal: any subsequent mutation on the same key is discarded.
+//   - UPDATE cannot be followed by CREATE: the resource already exists remotely.
+//   - All other cases replace the existing entry (newer mutation supersedes).
+//
+// The on-disk snapshot is refreshed whenever the set of critical entries may
+// have changed: the new entry is critical, or it replaced a critical one.
+func (c *DTNCache) enqueue(e *DTNEntry) {
+	c.mu.Lock()
+	found, mutated, replacedCritical := false, false, false
+	for i, existing := range c.entries {
+		if existing.did != e.did || existing.resourceID != e.resourceID {
+			continue
+		}
+		// Without a resource ID the only thing identifying the message kind
+		// is its protocol; different protocols are unrelated messages.
+		if e.resourceID == "" && existing.proto != e.proto {
+			continue
+		}
+		found = true
+		if existing.proto == ProtocolDeleteResource ||
+			(existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) {
+			break // discard new entry silently — existing state is authoritative
+		}
+		replacedCritical = existing.isEffectivelyCritical()
+		c.entries[i] = e
+		mutated = true
+		break
+	}
+	if !found {
+		c.entries = append(c.entries, e)
+		mutated = true
+	}
+	c.mu.Unlock()
+	if mutated && (replacedCritical || e.isEffectivelyCritical()) {
+		go c.persistToDisk()
+	}
+}
+
+// peersWithPending lists, in first-seen order and without duplicates, the did
+// of every peer that currently has at least one effectively-critical entry
+// queued.  Used to populate Heartbeat.PendingContact.
+func (c *DTNCache) peersWithPending() []string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	var peers []string
+	known := make(map[string]struct{})
+	for _, entry := range c.entries {
+		if !entry.isEffectivelyCritical() {
+			continue
+		}
+		if _, dup := known[entry.did]; dup {
+			continue
+		}
+		known[entry.did] = struct{}{}
+		peers = append(peers, entry.did)
+	}
+	return peers
+}
+
+// drain hands the whole queue to the caller and leaves the cache empty, all
+// under a single lock acquisition.
+func (c *DTNCache) drain() []*DTNEntry {
+	c.mu.Lock()
+	taken := c.entries
+	c.entries = nil
+	c.mu.Unlock()
+	return taken
+}
+
+// requeue re-inserts entries in front of whatever was enqueued while the
+// retry loop was running.  An empty argument is a no-op.
+func (c *DTNCache) requeue(entries []*DTNEntry) {
+	if len(entries) > 0 {
+		c.mu.Lock()
+		c.entries = append(entries, c.entries...)
+		c.mu.Unlock()
+	}
+}
+
+// ── Persistence ──────────────────────────────────────────────────────────────
+
+// DTNCachePath returns the location of the encrypted cache file: a file named
+// "dnt_cache.bin" in the directory that holds the node's private key.
+func DTNCachePath() string {
+	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
+	return filepath.Join(keyDir, "dnt_cache.bin")
+}
+
+// deriveDNTKey produces the 32-byte AES key protecting the on-disk cache.
+// The key is the HKDF-SHA256 expansion of the node's raw private key (no
+// salt, fixed info string), so the same private key always yields the same
+// AES key and nothing extra has to be stored.
+func deriveDNTKey() ([]byte, error) {
+	priv, err := tools.LoadKeyFromFilePrivate()
+	if err != nil {
+		return nil, err
+	}
+	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
+	// (32-byte seed || 32-byte public key); all of it is used as input keying
+	// material.
+	ikm, err := priv.Raw()
+	if err != nil {
+		return nil, err
+	}
+	kdf := hkdf.New(sha256.New, ikm, nil, []byte("oc-discovery/dnt-cache/v1"))
+	derived := make([]byte, 32)
+	_, err = io.ReadFull(kdf, derived)
+	if err != nil {
+		return nil, err
+	}
+	return derived, nil
+}
+
+// dtnPersistMu serialises persistToDisk.  The function is launched as a
+// goroutine from several call sites and every run targets the same file
+// (DTNCachePath is process-wide), so without it two runs could interleave
+// writes to the shared ".tmp" file or rename an older snapshot over a newer
+// one.
+var dtnPersistMu sync.Mutex
+
+// persistToDisk encrypts all current critical entries and writes them to disk.
+// Non-critical entries are deliberately excluded — they are not worth restoring
+// after a restart given their limited retry budget.
+//
+// The snapshot is taken after dtnPersistMu is acquired, so the last writer to
+// finish always wrote the most recent state.  The file is written to a
+// temporary path and renamed into place.  All failures are logged and
+// otherwise ignored: persistence is best-effort.  The function is a no-op
+// when no AES key is available.
+func (c *DTNCache) persistToDisk() {
+	if c.aesKey == nil {
+		return
+	}
+	log := oclib.GetLogger()
+
+	dtnPersistMu.Lock()
+	defer dtnPersistMu.Unlock()
+
+	// c.mu is held only while copying the entries, never during disk I/O.
+	c.mu.Lock()
+	var toSave []DTNEntryJSON
+	for _, e := range c.entries {
+		if e.isEffectivelyCritical() {
+			toSave = append(toSave, e.toJSON())
+		}
+	}
+	c.mu.Unlock()
+
+	plaintext, err := json.Marshal(toSave)
+	if err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to encode cache entries")
+		return
+	}
+
+	block, err := aes.NewCipher(c.aesKey)
+	if err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
+		return
+	}
+	gcm, err := cipher.NewGCM(block)
+	if err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to initialise cache cipher")
+		return
+	}
+	// A fresh random nonce per write; it is stored as the file's prefix.
+	nonce := make([]byte, gcm.NonceSize())
+	if _, err := io.ReadFull(rand.Reader, nonce); err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to generate cache nonce")
+		return
+	}
+	ciphertext := gcm.Seal(nonce, nonce, plaintext, nil)
+
+	path := DTNCachePath()
+	tmp := path + ".tmp"
+	if err := os.WriteFile(tmp, ciphertext, 0600); err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to write cache file")
+		return
+	}
+	if err := os.Rename(tmp, path); err != nil {
+		log.Warn().Err(err).Msg("[dnt] failed to rename cache file")
+		_ = os.Remove(tmp) // best-effort cleanup; the rename error is already logged
+	}
+}
+
+// loadFromDisk reads the encrypted cache file, decrypts it and appends every
+// effectively-critical entry it contains to the in-memory queue.
+// Any failure (missing file, short file, decryption or decoding error) leaves
+// the queue untouched; all but the missing-file and cipher-setup cases are
+// logged as warnings.  Does nothing when no AES key is available.
+func (c *DTNCache) loadFromDisk() {
+	if c.aesKey == nil {
+		return
+	}
+	logger := oclib.GetLogger()
+	raw, readErr := os.ReadFile(DTNCachePath())
+	if readErr != nil {
+		if os.IsNotExist(readErr) {
+			return
+		}
+		logger.Warn().Err(readErr).Msg("[dnt] failed to read cache file")
+		return
+	}
+
+	blockCipher, err := aes.NewCipher(c.aesKey)
+	if err != nil {
+		return
+	}
+	aead, err := cipher.NewGCM(blockCipher)
+	if err != nil {
+		return
+	}
+	// File layout: nonce || sealed payload.
+	nonceLen := aead.NonceSize()
+	if len(raw) < nonceLen {
+		logger.Warn().Msg("[dnt] cache file too short, ignoring")
+		return
+	}
+	plaintext, err := aead.Open(nil, raw[:nonceLen], raw[nonceLen:], nil)
+	if err != nil {
+		logger.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring")
+		return
+	}
+
+	var records []DTNEntryJSON
+	if err := json.Unmarshal(plaintext, &records); err != nil {
+		logger.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring")
+		return
+	}
+
+	restored := 0
+	for i := range records {
+		// Moderate entries are never written, but filter anyway in case the
+		// file was produced by a different version.
+		entry := entryFromJSON(records[i])
+		if entry.isEffectivelyCritical() {
+			c.entries = append(c.entries, entry)
+			restored++
+		}
+	}
+	if restored > 0 {
+		logger.Info().Int("count", restored).Msg("[dnt] restored critical entries from disk")
+	}
+}
+
+// ── Retry loop ────────────────────────────────────────────────────────────────
+
+// startDNTLoop runs the background retry goroutine.  Call once after init.
+//
+// The loop never returns: it waits either for the DTNRetryInterval ticker
+// (retry everything) or for a peer ID on s.dntNudge (retry only that peer's
+// entries right away).  Entries are taken out of the cache with drain, retried
+// through s.write, and the survivors are put back with requeue.
+func (s *StreamService) startDNTLoop() {
+	logger := oclib.GetLogger()
+	ticker := time.NewTicker(DTNRetryInterval)
+	defer ticker.Stop()
+
+	// retryEntries attempts delivery for the given entries and returns those
+	// that must be kept for the next round.
+	retryEntries := func(entries []*DTNEntry) []*DTNEntry {
+		var keep []*DTNEntry
+		for _, e := range entries {
+			_, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto)
+			if err == nil {
+				// Delivered: log and drop the entry.
+				if e.isEffectivelyCritical() {
+					logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
+						Msg("[dnt] critical message delivered after retry")
+				} else {
+					logger.Info().Str("proto", string(e.proto)).Str("peer", e.did).
+						Int("retries", e.retries).Msg("[dnt] moderate message delivered after retry")
+				}
+				continue
+			}
+			// Failed: critical entries are kept unconditionally, the others
+			// spend one unit of their retry budget.
+			if e.isEffectivelyCritical() {
+				keep = append(keep, e)
+			} else {
+				e.retries++
+				if e.retries < DTNMaxModerateRetries {
+					keep = append(keep, e)
+				} else {
+					logger.Warn().Str("proto", string(e.proto)).Str("peer", e.did).
+						Int("retries", e.retries).Msg("[dnt] moderate message abandoned after max retries")
+				}
+			}
+		}
+		return keep
+	}
+
+	for {
+		select {
+		case <-ticker.C:
+			entries := s.dnt.drain()
+			if len(entries) == 0 {
+				continue
+			}
+			s.dnt.requeue(retryEntries(entries))
+			// Refresh the on-disk snapshot after every non-empty round.
+			go s.dnt.persistToDisk()
+
+		case peerID := <-s.dntNudge:
+			// A peer just signalled it is reachable — retry its entries immediately.
+			entries := s.dnt.drain()
+			var forPeer, other []*DTNEntry
+			for _, e := range entries {
+				if e.did == peerID {
+					forPeer = append(forPeer, e)
+				} else {
+					other = append(other, e)
+				}
+			}
+			kept := retryEntries(forPeer)
+			// Entries for other peers go back untouched (no retry spent).
+			s.dnt.requeue(append(kept, other...))
+			// Only rewrite the file when at least one entry left the queue.
+			if len(kept) < len(forPeer) {
+				go s.dnt.persistToDisk()
+			}
+		}
+	}
+}
@@ -15,6 +15,7 @@ import (
 	"cloud.o-forge.io/core/oc-lib/models/resources"
 	"cloud.o-forge.io/core/oc-lib/tools"
 	"github.com/libp2p/go-libp2p/core/network"
+	pp "github.com/libp2p/go-libp2p/core/peer"
 )

 type Verify struct {
@@ -23,8 +24,18 @@ type Verify struct {

 func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s network.Stream) error {
 	fmt.Println("handleEvent", protocol)
-	// Heartbeat received on an outgoing ProtocolObserve stream.
 	if protocol == ProtocolObserve {
+		// Distinguish between an open request and a close request by inspecting
+		// the ObserveRequest payload. The remote wraps both in a common.Event
+		// with Type=ProtocolObserve so the persistent readLoop can decode them.
+		var req ObserveRequest
+		if evt.Payload != nil {
+			json.Unmarshal(evt.Payload, &req) //nolint:errcheck — zero value means open
+		}
+		if req.Close {
+			ps.observeCache.cancel(s.Conn().RemotePeer().String())
+			return nil
+		}
 		return ps.handleIncomingObserve(s)
 	}
 	if protocol == observeHBEventType {
@@ -59,6 +70,11 @@ func (ps *StreamService) handleEvent(protocol string, evt *common.Event, s netwo
 			return err
 		}
 	}
+	if protocol == ProtocolSourcePresignResource {
+		if err := ps.pass(evt, tools.SOURCE_PRESIGN_EVENT); err != nil {
+			return err
+		}
+	}
 	if protocol == ProtocolAdmiraltyConfigResource {
 		if err := ps.pass(evt, tools.ADMIRALTY_CONFIG_EVENT); err != nil {
 			return err
@@ -125,9 +141,9 @@ func (abs *StreamService) sendPlanner(event *common.Event) error { //
 }

 func (abs *StreamService) retrieveResponse(event *common.Event) error { //
-	if !abs.ResourceSearches.IsActive(event.User) {
+	/*if !abs.ResourceSearches.IsActive(event.User) {
 		return nil // search already closed or timed out
-	}
+	}*/
 	res, err := resources.ToResource(int(event.DataType), event.Payload)
 	if err != nil || res == nil {
 		return nil
@@ -137,6 +153,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
 	b, err := json.Marshal(res.Serialize(res))
 	go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
 		FromApp:  "oc-discovery",
+		User:     event.User,
 		Datatype: tools.DataType(event.DataType),
 		Method:   int(tools.SEARCH_EVENT),
 		Payload:  b,
@@ -147,6 +164,7 @@ func (abs *StreamService) retrieveResponse(event *common.Event) error { //
 func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) error { //
 	go tools.NewNATSCaller().SetNATSPub(method, tools.NATSResponse{
 		FromApp:  "oc-discovery",
+		User:     event.User,
 		Datatype: tools.DataType(event.DataType),
 		Method:   int(method),
 		Payload:  event.Payload,
@@ -154,6 +172,36 @@ func (abs *StreamService) pass(event *common.Event, method tools.NATSMethod) err
 	return nil
 }

+// resolveBookingNano does a single DB lookup and returns:
+//
+//	(nil, true)  — not a booking, dest_peer_id absent, or dest == self → process normally, no forward
+//	(nano, true) — dest is one of our NANO peers → process + forward to nano
+//	(nil, false) — dest is unknown → ignore
+//
+// A payload that cannot be decoded is treated like a missing dest_peer_id.
+// A lookup result that is not a *peer.Peer is treated like an unknown
+// destination instead of panicking on the type assertion.
+func (ps *StreamService) resolveBookingNano(evt *common.Event) (*peer.Peer, bool) {
+	if tools.DataType(evt.DataType) != tools.BOOKING {
+		return nil, true
+	}
+	var b struct {
+		DestPeerID string `json:"dest_peer_id"`
+	}
+	if err := json.Unmarshal(evt.Payload, &b); err != nil || b.DestPeerID == "" {
+		return nil, true
+	}
+	if self, err := oclib.GetMySelf(); err == nil && self != nil && b.DestPeerID == self.GetID() {
+		return nil, true
+	}
+	// Look for a peer with this ID that is registered with the NANO relation.
+	d := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil).Search(&dbs.Filters{
+		And: map[string][]dbs.Filter{
+			"id":       {{Operator: dbs.EQUAL.String(), Value: b.DestPeerID}},
+			"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
+		},
+	}, "", false, 0, 1)
+	if len(d.Data) == 0 {
+		return nil, false
+	}
+	nano, isPeer := d.Data[0].(*peer.Peer)
+	if !isPeer {
+		return nil, false
+	}
+	return nano, true
+}
+
 func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol string) error {
 	switch protocol {
 	case ProtocolSearchResource:
@@ -176,9 +224,10 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
 				ps.SendResponse(p[0], evt, fmt.Sprintf("%v", search))
 			}
 		} else {
-			fmt.Println("SEND SEARCH_EVENT SetNATSPub", m)
-			go tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
+			fmt.Println("SEND SEARCH_EVENT SetNATSPub", m, evt.DataType, evt.User)
+			tools.NewNATSCaller().SetNATSPub(tools.SEARCH_EVENT, tools.NATSResponse{
 				FromApp:  "oc-discovery",
+				User:     evt.User,
 				Datatype: tools.DataType(evt.DataType),
 				Method:   int(tools.SEARCH_EVENT),
 				Payload:  evt.Payload,
@@ -186,19 +235,35 @@ func (ps *StreamService) handleEventFromPartner(evt *common.Event, protocol stri
 		}
 	case ProtocolCreateResource, ProtocolUpdateResource:
 		fmt.Println("RECEIVED Protocol.Update", string(evt.Payload))
-		go tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
+		nano, ok := ps.resolveBookingNano(evt)
+		if !ok {
+			return nil
+		}
+		tools.NewNATSCaller().SetNATSPub(tools.CREATE_RESOURCE, tools.NATSResponse{
 			FromApp:  "oc-discovery",
+			User:     evt.User,
 			Datatype: tools.DataType(evt.DataType),
 			Method:   int(tools.CREATE_RESOURCE),
 			Payload:  evt.Payload,
 		})
+		if nano != nil {
+			ps.forwardToNano(nano, evt, protocol)
+		}
 	case ProtocolDeleteResource:
-		go tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
+		nano, ok := ps.resolveBookingNano(evt)
+		if !ok {
+			return nil
+		}
+		tools.NewNATSCaller().SetNATSPub(tools.REMOVE_RESOURCE, tools.NATSResponse{
 			FromApp:  "oc-discovery",
+			User:     evt.User,
 			Datatype: tools.DataType(evt.DataType),
 			Method:   int(tools.REMOVE_RESOURCE),
 			Payload:  evt.Payload,
 		})
+		if nano != nil {
+			ps.forwardToNano(nano, evt, protocol)
+		}
 	default:
 		return errors.New("no action authorized available : " + protocol)
 	}
@@ -223,11 +288,31 @@ func (abs *StreamService) SendResponse(p *peer.Peer, event *common.Event, search
 			access := oclib.NewRequestAdmin(oclib.LibDataEnum(dt), nil)
 			searched := access.Search(abs.FilterPeer(self.GetID(), event.Groups, search), "", false, 0, 0)
 			for _, ss := range searched.Data {
+				// SendResponse uses an admin request so SetAllowedInstances
+				// never calls FilterExploitationAuthorizations.  Apply it
+				// explicitly here so we never leak private AEs to a remote peer.
+				if r, ok := ss.(resources.ResourceInterface); ok {
+					r.SetAllowedInstances(&tools.APIRequest{PeerID: p.UUID, Groups: event.Groups, Username: event.User})
+				}
 				if j, err := json.Marshal(ss); err == nil {
 					abs.PublishCommon(&dt, event.User, event.Groups, p.PeerID, ProtocolSearchResource, j)
 				}
 			}
 		}
 	}
+	// Close the ProtocolSearchResource stream to the requester immediately after
+	// sending all results. This prevents TempStream from reusing a stale (already
+	// closed by the remote) stream entry for a subsequent search from the same peer,
+	// which would cause write failure and no results for the second search.
+	if decodedID, err := pp.Decode(p.PeerID); err == nil {
+		abs.Mu.Lock()
+		if abs.Streams[ProtocolSearchResource] != nil {
+			if s, ok := abs.Streams[ProtocolSearchResource][decodedID]; ok {
+				s.Stream.Reset()
+				delete(abs.Streams[ProtocolSearchResource], decodedID)
+			}
+		}
+		abs.Mu.Unlock()
+	}
 	return nil
 }
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
 // observeHBEventType is used as the common.Event.Type for heartbeat responses.
 const observeHBEventType = "/opencloud/peer/observe/heartbeat"

-const observeHBInterval = 30 * time.Second
+const observeHBInterval = 10 * time.Second
 const observeDrainDuration = 30 * time.Second

 // observeBatchWindow is the accumulation window before a heartbeat batch is
@@ -45,7 +45,95 @@ type ObserveRequest struct {

 // ObserveHeartbeat is sent by the observed side every observeHBInterval.
 type ObserveHeartbeat struct {
-	State string `json:"state"` // always "online" when actively emitted
+	State  string    `json:"state"`             // always "online" when actively emitted
+	// SentAt is the sender's wall-clock time. The receiver subtracts it from
+	// its own clock, so the derived latency is only as accurate as the clock
+	// synchronisation between the two peers.
+	// NOTE: encoding/json never treats a time.Time as "empty", so omitempty
+	// does not drop a zero SentAt; receivers test IsZero instead.
+	SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
+}
+
+// Scoring parameters used by PeerObserveMetrics.snapshot.
+const (
+	maxLatencyMs      = 2000.0 // average latency (ms) at or above which the latency score is 0
+	latencySamples    = 5      // sliding window size for latency averaging
+	fastThresholdMs   = 200.0  // average latency (ms) below this = "fast", at or above = "slow"
+	reliableThreshold = 0.95   // reliability score (1 - miss_rate) below this = "watch", otherwise "reliable"
+)
+
+// PeerObserveMetrics accumulates connection-quality data for one observed peer.
+// Updated on every incoming heartbeat (observing side).
+// record and snapshot both take mu, so a *PeerObserveMetrics can be shared
+// between goroutines as long as callers go through those two methods.
+type PeerObserveMetrics struct {
+	mu              sync.Mutex                    // held by record and snapshot
+	firstObservedAt time.Time                     // supplied by whoever creates the entry; never modified by record
+	lastHeartbeatAt time.Time                     // UTC time of the most recent record call
+	received        uint64                        // number of record calls so far
+	latencies       [latencySamples]time.Duration // ring buffer of the most recent latency samples
+	latIdx          int                           // samples written so far; latIdx % latencySamples is the next slot
+	latCount        int                           // number of valid entries in latencies, capped at latencySamples
+}
+
+// record registers one received heartbeat: it bumps the received counter,
+// stamps the arrival time (UTC) and writes latency into the next ring-buffer
+// slot, overwriting the oldest sample once latencySamples values are held.
+func (m *PeerObserveMetrics) record(latency time.Duration) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.lastHeartbeatAt = time.Now().UTC()
+	m.received++
+
+	slot := m.latIdx % latencySamples
+	m.latencies[slot] = latency
+	m.latIdx++
+
+	// latCount saturates at the window size.
+	if m.latCount >= latencySamples {
+		return
+	}
+	m.latCount++
+}
+
+// snapshot returns a point-in-time quality summary built from the samples
+// recorded so far.
+//
+//   - LatencyMs is the mean of the latency samples currently in the window.
+//   - MissRate is 1 - received/expected, where expected is the number of
+//     heartbeats that should have arrived between the first observation and
+//     the last heartbeat at one per observeHBInterval. It is 0 while nothing
+//     is expected yet.
+//   - TrustScore is (0.35*latency score + 0.65*reliability score) * 100,
+//     i.e. a value in [0, 100].
+func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	var total time.Duration
+	for i := 0; i < m.latCount; i++ {
+		total += m.latencies[i]
+	}
+	var avgMs float64
+	if m.latCount > 0 {
+		avgMs = float64(total.Milliseconds()) / float64(m.latCount)
+	}
+
+	// Expected heartbeats = whole intervals elapsed between the first
+	// observation and the last heartbeat, plus one for the heartbeat that
+	// opened the window (it is counted in received as well).
+	// Use Sub for the elapsed time: Time.Second() is the second-of-minute
+	// field (0–59), not a duration.
+	var expected int64
+	if m.received > 0 {
+		if elapsed := m.lastHeartbeatAt.Sub(m.firstObservedAt); elapsed >= 0 {
+			expected = int64(elapsed/observeHBInterval) + 1
+		}
+	}
+	var missRate float64
+	if expected > 0 {
+		recv := int64(m.received)
+		if recv > expected {
+			// Early or bursty deliveries must not yield a negative miss rate.
+			recv = expected
+		}
+		missRate = 1.0 - float64(recv)/float64(expected)
+	}
+
+	// Clamp to [0, 1]: very slow links floor at 0, and a negative average
+	// (sender clock ahead of ours) must not push the score above 1.
+	latScore := 1.0 - avgMs/maxLatencyMs
+	if latScore < 0 {
+		latScore = 0
+	} else if latScore > 1 {
+		latScore = 1
+	}
+	relScore := 1.0 - missRate
+	trust := (0.35*latScore + 0.65*relScore) * 100
+
+	speed := "fast"
+	if avgMs >= fastThresholdMs {
+		speed = "slow"
+	}
+	reliability := "reliable"
+	if relScore < reliableThreshold {
+		reliability = "watch"
+	}
+	return PeerObserveSnapshot{
+		LatencyMs:   avgMs,
+		Speed:       speed,
+		Reliability: reliability,
+		TrustScore:  trust,
+		LastSeenAt:  m.lastHeartbeatAt,
+		MissRate:    missRate,
+	}
+}
+
+// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
+// Values are produced by PeerObserveMetrics.snapshot.
+type PeerObserveSnapshot struct {
+	// LatencyMs is the mean of the latency samples in the current window, in milliseconds.
+	LatencyMs   float64   `json:"latency_ms"`
+	Speed       string    `json:"speed"`       // "fast" | "slow"
+	Reliability string    `json:"reliability"` // "reliable" | "watch"
+	// TrustScore is (0.35*latency score + 0.65*reliability score) * 100.
+	TrustScore  float64   `json:"trust_score"`
+	// LastSeenAt is the arrival time of the most recently recorded heartbeat.
+	LastSeenAt  time.Time `json:"last_seen_at"`
+	// MissRate is 1 - received/expected heartbeats; 0 while nothing is expected yet.
+	MissRate    float64   `json:"miss_rate"`
 }

 // ShallowPeer is the minimal peer representation sent by oc-peer in a
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {

 // ── incoming observe handler (observed side) ──────────────────────────────────

-// handleIncomingObserve is registered as the ProtocolObserve stream handler.
-// It is called when a remote peer opens an observe stream to us.
-// The function reads the request, validates it, then starts (or stops) the
-// heartbeat goroutine and returns immediately — the goroutine owns the stream.
+// handleIncomingObserve is called when a remote peer opens an observe stream
+// to us (observed side). It starts a heartbeat goroutine that writes back on
+// the same bidirectional rawStream — no separate reverse stream is opened.
+// The goroutine stops via context cancellation (triggered by a close event
+// read from rawStream) or when rawStream becomes unwritable.
 func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
 	remotePeerID := rawStream.Conn().RemotePeer().String()
-	addr := rawStream.Conn().RemoteMultiaddr().String()
-	ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
-	if err != nil {
-		fmt.Println("qndlqnl EERR", addr, err)
-		return err
-	}
 	log := oclib.GetLogger()

 	// Drain mode: reject any new observations for 30 s after a close-all.
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
 	draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
 	s.drainMu.RUnlock()
 	if draining {
-		rawStream.Close()
 		fmt.Println("Draining")
-		return errors.New("Draining")
+		return errors.New("draining")
 	}
-	// Read the observe request (with a generous deadline to avoid hangs).
-	// Guard: the requesting peer must not be blacklisted or be ourself.
-	did := ""
+
+	// Guard: the requesting peer must not be blacklisted.
 	access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
 	res := access.Search(&dbs.Filters{
 		And: map[string][]dbs.Filter{
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
 	}, "", false, 0, 1)
 	if len(res.Data) > 0 {
 		p := res.Data[0].(*peer.Peer)
-		did = p.GetID()
-		if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
-			rawStream.Close()
+		if p.Relation == peer.BLACKLIST {
 			fmt.Println("CLOSE blacklist or self")
-			return errors.New("can't exploit blacklist or self")
+			return errors.New("can't observe blacklisted peer")
 		}
 	}

@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
 	s.observeCache.set(remotePeerID, cancel)
 	fmt.Println("LOOP OBSERVE")
 	go func() {
-		defer rawStream.Close()
+		// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
+		// owns rawStream's lifecycle. We only stop writing.
 		defer cancel()
 		defer s.observeCache.delete(remotePeerID)

 		ticker := time.NewTicker(observeHBInterval)
 		defer ticker.Stop()

-		hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
-		evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
-		if evt == nil {
-			return
-		}
-		if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
-			stream := s.Streams[ProtocolObserve][ad.ID]
-			if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
-				// Moderate connectivity event: the observer is unreachable.
-				// The deferred calls above purge this observer from the cache.
-				fmt.Println("LOOP EVT ERR", err)
-				log.Info().
-					Str("observer", remotePeerID).
-					Err(err).
-					Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
-				return
-			}
+		buildHBEvent := func() *common.Event {
+			p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
+			return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
 		}
+
 		for {
 			select {
 			case <-ctx.Done():
 				return
 			case <-ticker.C:
-
 				rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
-				fmt.Println("LOOP EVT", evt)
-				var err error
-				if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
-					stream := s.Streams[ProtocolObserve][ad.ID]
-					if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
-						// Moderate connectivity event: the observer is unreachable.
-						// The deferred calls above purge this observer from the cache.
-						fmt.Println("LOOP EVT ERR", err)
-						log.Info().
-							Str("observer", remotePeerID).
-							Err(err).
-							Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
-						return
-					}
+				evt := buildHBEvent()
+				if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
+					log.Info().
+						Str("observer", remotePeerID).
+						Err(err).
+						Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
+					return
 				}
 				rawStream.SetWriteDeadline(time.Time{})
 			}
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
 // ── heartbeat receiver (observing side) ───────────────────────────────────────

 // handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
-// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
-// accumulator; the batcher flushes to NATS after observeBatchWindow.
+// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
+// a quality snapshot to NATS.
+//
+// A heartbeat whose payload cannot be decoded, or that carries no SentAt
+// timestamp, updates no metrics but still triggers the NATS notification.
+// The function always returns nil.
 func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
-	// ps.hbBatcher.add(evt.From)
-	flushObserveBatch([]string{evt.From})
+	var hb ObserveHeartbeat
+	if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
+		// SentAt comes from the remote wall clock; clock skew can make the
+		// difference negative, which is meaningless as a latency.
+		latency := time.Since(hb.SentAt)
+		if latency < 0 {
+			latency = 0
+		}
+		// Load first so the steady state does not allocate a throw-away
+		// PeerObserveMetrics on every heartbeat.
+		raw, ok := ps.observeMetrics.Load(evt.From)
+		if !ok {
+			raw, _ = ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
+				firstObservedAt: time.Now().UTC(),
+			})
+		}
+		// The map holds a pointer, so record mutates the stored entry in
+		// place; writing it back with Store is unnecessary and could
+		// re-insert an entry that was deleted concurrently.
+		raw.(*PeerObserveMetrics).record(latency)
+	}
+	ps.flushObserveForPeer(evt.From, evt.User)
 	return nil
 }

+// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
+// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
+//
+// The same payload is then wrapped in a PB_PROPAGATE propagation message and
+// published as a PROPALGATION_EVENT. When no metrics are stored for peerID the
+// "metrics" entry for that peer is null. Publishing is best effort: a
+// marshalling failure is logged and the remaining steps are skipped.
+func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
+	var snap *PeerObserveSnapshot
+	if raw, ok := ps.observeMetrics.Load(peerID); ok {
+		s := raw.(*PeerObserveMetrics).snapshot()
+		snap = &s
+	}
+	payload, err := json.Marshal(map[string]any{
+		"peer_ids": []string{peerID},
+		"state":    "online",
+		"metrics":  map[string]*PeerObserveSnapshot{peerID: snap},
+	})
+	if err != nil {
+		logger := oclib.GetLogger()
+		logger.Warn().
+			Str("peer", peerID).
+			Err(err).
+			Msg("[observe] cannot marshal observe response payload")
+		return
+	}
+	tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
+		FromApp:  "oc-discovery",
+		Datatype: tools.PEER,
+		User:     user,
+		Method:   int(tools.PEER_OBSERVE_RESPONSE_EVENT),
+		Payload:  payload,
+	})
+	propPayload, err := json.Marshal(tools.PropalgationMessage{
+		DataType: int(tools.PEER),
+		Action:   tools.PB_PROPAGATE,
+		Payload:  payload,
+	})
+	if err != nil {
+		logger := oclib.GetLogger()
+		logger.Warn().
+			Str("peer", peerID).
+			Err(err).
+			Msg("[observe] cannot marshal observe propagation payload")
+		return
+	}
+	tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
+		FromApp:  "oc-discovery",
+		Datatype: tools.PEER,
+		User:     user,
+		Method:   int(tools.PROPALGATION_EVENT),
+		Payload:  propPayload,
+	})
+}
+
 // ── user→peer index (ref-counted observe management) ─────────────────────────

 // userPeerIndex tracks which users are observing which peers.
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
 }

 // closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
-// the remote side.
+// the remote side. The close event is wrapped in a common.Event so the remote's
+// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
 func (ps *StreamService) closeObserveStream(toPeerID string) error {
 	decodedID, err := pp.Decode(toPeerID)
 	if err != nil {
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
 	ps.Mu.Lock()
 	if ps.Streams[ProtocolObserve] != nil {
 		if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
-			_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
+			closePayload, _ := json.Marshal(ObserveRequest{Close: true})
+			closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
+			_ = json.NewEncoder(s.Stream).Encode(closeEvt)
 			s.Stream.Close()
 			delete(ps.Streams[ProtocolObserve], decodedID)
 		}
 	}
 	ps.Mu.Unlock()
+	ps.observeMetrics.Delete(toPeerID)
 	return nil
 }

@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
 func (ps *StreamService) CloseAllObserves() {
 	ps.Mu.Lock()
 	for _, s := range ps.Streams[ProtocolObserve] {
-		_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
+		closePayload, _ := json.Marshal(ObserveRequest{Close: true})
+		closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
+		_ = json.NewEncoder(s.Stream).Encode(closeEvt)
 		s.Stream.Close()
 	}
 	delete(ps.Streams, ProtocolObserve)
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {

 	// Reset user index so stale ref-counts don't block future opens.
 	ps.observeUsers = newUserPeerIndex()
+	ps.observeMetrics.Range(func(k, _ any) bool {
+		ps.observeMetrics.Delete(k)
+		return true
+	})

 	ps.drainMu.Lock()
 	ps.drainUntil = time.Now().Add(observeDrainDuration)
@@ -61,15 +61,17 @@ func (ps *StreamService) PublishCommon(dt *tools.DataType, user string, groups [
 		}
 		stream, err := ps.write(toPeerID, ad, dt, user, resource, proto)
 		if err != nil {
-			if _, ok := dntProtocols[proto]; ok {
-				ps.dnt.enqueue(&dntEntry{
-					did:     toPeerID,
-					addr:    *ad,
-					dt:      dt,
-					user:    user,
-					payload: resource,
-					proto:   proto,
-					addedAt: time.Now().UTC(),
+			if _, ok := DTNProtocols[proto]; ok {
+				ps.dnt.enqueue(&DTNEntry{
+					did:           toPeerID,
+					resourceID:    extractResourceID(resource),
+					forceCritical: pe.Relation == peer.NANO,
+					addr:          *ad,
+					dt:            dt,
+					user:          user,
+					payload:       resource,
+					proto:         proto,
+					addedAt:       time.Now().UTC(),
 				})
 			}
 			return nil, err
@@ -125,20 +127,45 @@ func (ps *StreamService) ToPartnerPublishEvent(

 		return nil
 	}
-	ks := []protocol.ID{}
-	for k := range protocolsPartners {
-		ks = append(ks, k)
+	// Extract creator_id to route to the correct nano.
+	// A master must only forward a resource to the nano that owns it.
+	var creatorID string
+	var minPayload struct {
+		CreatorID string `json:"creator_id"`
 	}
-	for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER, peer.NANO} {
+	if json.Unmarshal(payload, &minPayload) == nil {
+		creatorID = minPayload.CreatorID
+	}
+
+	// PARTNER and MASTER receive every resource unconditionally.
+	for _, rel := range []peer.PeerRelation{peer.PARTNER, peer.MASTER} {
 		ps.PublishesCommon(dt, user, groups, &dbs.Filters{
 			And: map[string][]dbs.Filter{
 				"relation": {{Operator: dbs.EQUAL.String(), Value: rel}},
 			},
 		}, payload, proto)
 	}
+
+	// NANO: only send to the nano whose UUID matches the resource creator.
+	if creatorID != "" {
+		ps.PublishesCommon(dt, user, groups, &dbs.Filters{
+			And: map[string][]dbs.Filter{
+				"relation": {{Operator: dbs.EQUAL.String(), Value: peer.NANO}},
+				"id":       {{Operator: dbs.EQUAL.String(), Value: creatorID}},
+			},
+		}, payload, proto)
+	}
 	return nil
 }

+// forwardToNano relays a booking mutation event to an already-resolved NANO
+// peer over the given stream protocol. The caller is responsible for passing
+// a non-nil nano. The result of PublishCommon is not inspected here; on a
+// write failure PublishCommon itself enqueues DTN-eligible protocols with
+// forceCritical set for NANO relations.
+func (abs *StreamService) forwardToNano(nano *peer.Peer, evt *common.Event, proto string) {
+	dataType := tools.DataType(evt.DataType)
+	target := protocol.ID(proto)
+	abs.PublishCommon(&dataType, evt.User, evt.Groups, nano.PeerID, target, evt.Payload)
+}
+
 func (s *StreamService) write(
 	did string,
 	peerID *pp.AddrInfo,
@@ -27,6 +27,10 @@ const ProtocolConsidersResource = "/opencloud/resource/considers/1.0"
 const ProtocolMinioConfigResource = "/opencloud/minio/config/1.0"
 const ProtocolAdmiraltyConfigResource = "/opencloud/admiralty/config/1.0"

+// ProtocolSourcePresignResource routes PB_SOURCE_PRESIGN to the resource-owner peer.
+// The owner generates a pre-signed Minio URL and responds via PB_CONSIDERS.
+const ProtocolSourcePresignResource = "/opencloud/resource/source-presign/1.0"
+
 const ProtocolSearchResource = "/opencloud/resource/search/1.0"
 const ProtocolCreateResource = "/opencloud/resource/create/1.0"
 const ProtocolUpdateResource = "/opencloud/resource/update/1.0"
@@ -43,6 +47,7 @@ var protocols = map[protocol.ID]*common.ProtocolInfo{
 	ProtocolVerifyResource:          {WaitResponse: true, TTL: 1 * time.Minute},
 	ProtocolMinioConfigResource:     {WaitResponse: true, TTL: 1 * time.Minute},
 	ProtocolAdmiraltyConfigResource: {WaitResponse: true, TTL: 1 * time.Minute},
+	ProtocolSourcePresignResource:   {WaitResponse: true, TTL: 1 * time.Minute},
 	ProtocolObserve:                 {WaitResponse: true, TTL: 1 * time.Minute},
 }

@@ -63,8 +68,8 @@ type StreamService struct {
 	// IsPeerKnown, when set, is called at stream open for every inbound protocol.
 	// Return false to reset the stream immediately. Left nil until wired by the node.
 	IsPeerKnown func(pid pp.ID) bool
-	// dnt is the Disconnection Network Tolerance cache for outbound streams.
-	dnt *dntCache
+	// dnt is the DTN (Disconnection Network Tolerance) cache for outbound streams.
+	dnt *DTNCache
 	// observeCache tracks running heartbeat goroutines on the OBSERVED side.
 	observeCache *observeCache
 	// hbBatcher accumulates incoming heartbeats (observing side) and flushes
@@ -78,6 +83,12 @@ type StreamService struct {
 	// observeUsers tracks which users are observing which peers so streams are
 	// closed only when the last observer for a peer disconnects.
 	observeUsers *userPeerIndex
+	// observeMetrics accumulates connection-quality data per observed peer (observing side).
+	// Keys are peer_id strings; values are *PeerObserveMetrics.
+	observeMetrics sync.Map
+	// dntNudge receives peer IDs for which an immediate DTN retry should be
+	// attempted (e.g. when the peer just reconnected via PendingCallers).
+	dntNudge chan string
 }

 func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node common.DiscoveryPeer) (*StreamService, error) {
@@ -92,6 +103,7 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
 		dnt:              newDNTCache(),
 		observeCache:     newObserveCache(),
 		observeUsers:     newUserPeerIndex(),
+		dntNudge:         make(chan string, 32),
 	}
 	service.hbBatcher = newHeartbeatBatcher(flushObserveBatch)
 	for proto := range protocols {
@@ -105,6 +117,23 @@ func InitStream(ctx context.Context, h host.Host, key pp.ID, maxNode int, node c
 	return service, nil
 }

+// PendingContacts reports the peer IDs returned by the DTN cache's
+// peersWithPending, i.e. peers that still have at least one critical entry
+// waiting for delivery. Called on each heartbeat tick to populate
+// PendingContact.
+func (s *StreamService) PendingContacts() []string {
+	pending := s.dnt.peersWithPending()
+	return pending
+}
+
+// NudgeContacts asks the DTN loop to retry right away for each of the given
+// peer IDs (typically received via HeartbeatResponse.PendingCallers).
+// Sends on dntNudge are non-blocking: when the channel buffer is full the
+// nudge for that peer is dropped.
+func (s *StreamService) NudgeContacts(peerIDs []string) {
+	for i := range peerIDs {
+		select {
+		case s.dntNudge <- peerIDs[i]:
+			// queued
+		default:
+			// buffer full — skip rather than block the caller
+		}
+	}
+}
+
 // gate wraps a stream handler with IsPeerKnown validation.
 // If the peer is unknown the entire connection is closed and the handler is not called.
 // IsPeerKnown is read at stream-open time so it works even when set after InitStream.
@@ -117,6 +146,17 @@ func (s *StreamService) gatePrivilege(h func(network.Stream)) func(network.Strea
 				},
 			}, "", false, 0, 1)
 			if len(d.Data) == 0 {
+				stream.Reset()
+				return
+			}
+			master := d.Data[0].(*peer.Peer)
+			if stream.Conn().RemotePeer().String() != master.PeerID {
+				logger := oclib.GetLogger()
+				logger.Warn().
+					Str("remote", stream.Conn().RemotePeer().String()).
+					Str("master", master.PeerID).
+					Msg("[gate] nano rejected stream from non-master peer")
+				stream.Reset()
 				return
 			}
 		}
@@ -162,9 +202,17 @@ func (s *StreamService) HandleResponse(stream network.Stream) {
 		Stream: stream,
 		Expiry: time.Now().UTC().Add(expiry + 1*time.Minute),
 	}
+
+	// ProtocolObserve uses a bidirectional long-lived stream: the remote writes
+	// heartbeats back on the same stream, and may later send a close event.
+	// Use a persistent readLoop so we can receive both heartbeats and close events.
+	protoInfo := protocols[stream.Protocol()]
+	if stream.Protocol() == ProtocolObserve {
+		protoInfo = &common.ProtocolInfo{PersistantStream: true}
+	}
 	go s.readLoop(s.Streams[stream.Protocol()][stream.Conn().RemotePeer()],
 		stream.Conn().RemotePeer(),
-		stream.Protocol(), protocols[stream.Protocol()])
+		stream.Protocol(), protoInfo)
 }

 func (s *StreamService) connectToPartners() error {