package stream

// DTN_cache.go — Disconnection Network Tolerance cache for outbound stream requests.
//
// When a stream write fails because the remote peer is unreachable, the request
// is saved here and retried on the next tick. Two levels are defined:
//
//   - DTNCritical : retry indefinitely (create / update / delete resource).
//   - DTNModerate : up to DTNMaxModerateRetries retries, then abandon.
//
// Pubsub messages and search streams are explicitly excluded.
// Streams initiated from the indexer side are never enqueued here.
//
// # Crash-resilient persistence
//
// Critical entries are written to an encrypted file (AES-256-GCM) so they
// survive a node crash/restart. The AES key is derived deterministically from
// the node's Ed25519 private key via HKDF-SHA256 — no extra secret to manage.
// Moderate entries are intentionally not persisted: their retry budget is small
// enough that re-loading them after a restart would be misleading.
//
// Naming note: identifiers in this file use both the "DTN" and "DNT" spellings
// (DTNCache vs. newDNTCache, deriveDNTKey, startDNTLoop, "dnt_cache.bin");
// they all refer to this same cache.

import (
	"crypto/aes"
	"crypto/cipher"
	"crypto/rand"
	"crypto/sha256"
	"encoding/json"
	"io"
	"os"
	"path/filepath"
	"sync"
	"time"

	oclib "cloud.o-forge.io/core/oc-lib"
	"cloud.o-forge.io/core/oc-lib/tools"
	"golang.org/x/crypto/hkdf"
	"oc-discovery/conf"

	pp "github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/core/protocol"
)

// DTNLevel is the retry policy applied to a cached outbound request.
type DTNLevel int

// NOTE: DTNCritical is the zero value of DTNLevel. Indexing DTNProtocols with a
// protocol that is not in the map therefore also yields DTNCritical; use the
// two-value form (level, ok := DTNProtocols[p]) whenever absence matters.
const (
	DTNCritical DTNLevel = iota // retry until the message is delivered
	DTNModerate                 // retry up to DTNMaxModerateRetries times
)

// DTNMaxModerateRetries is the retry budget of a moderate entry: once its retry
// counter reaches this value the entry is abandoned by the retry loop.
const DTNMaxModerateRetries = 3

// DTNRetryInterval is the period of the ticker that drives the retry loop.
const DTNRetryInterval = 15 * time.Second

// DTNProtocols maps each stream protocol to its DTN level.
// Protocols absent from this map receive no caching (e.g. ProtocolSearchResource).
var DTNProtocols = map[protocol.ID]DTNLevel{
	// Critical — data mutations that must eventually be delivered.
	ProtocolCreateResource: DTNCritical,
	ProtocolUpdateResource: DTNCritical,
	ProtocolDeleteResource: DTNCritical,
	// Moderate — confirmations / config / planner: DTNMaxModerateRetries
	// retries before abandon.
	ProtocolVerifyResource:          DTNModerate,
	ProtocolSendPlanner:             DTNModerate,
	ProtocolConsidersResource:       DTNModerate,
	ProtocolMinioConfigResource:     DTNModerate,
	ProtocolAdmiraltyConfigResource: DTNModerate,
	ProtocolSourcePresignResource:   DTNModerate,
}

// DTNEntryJSON is the on-disk representation of a DTNEntry.
// DTNEntry only has unexported fields, which encoding/json ignores, so this
// struct mirrors them one-to-one with exported, tagged fields.
// NOTE(review): an earlier version of this comment said pp.AddrInfo and
// protocol.ID were "flattened"; they are in fact stored with their own types,
// so serialisation relies on how those types encode themselves — confirm the
// round-trip if either dependency is upgraded.
type DTNEntryJSON struct {
	DID           string          `json:"did"`
	ResourceID    string          `json:"resource_id,omitempty"`
	ForceCritical bool            `json:"force_critical,omitempty"`
	Addr          pp.AddrInfo     `json:"addr"`
	DT            *tools.DataType `json:"dt,omitempty"`
	User          string          `json:"user"`
	Payload       []byte          `json:"payload"`
	Proto         protocol.ID     `json:"proto"`
	Retries       int             `json:"retries"`
	AddedAt       time.Time       `json:"added_at"`
}

// DTNEntry is one cached outbound request waiting to be (re)delivered.
// (did, resourceID) is the deduplication key used by enqueue.
type DTNEntry struct {
	did           string // identifier of the destination peer
	resourceID    string // UUID of the resource; empty for non-resource payloads (planner, config)
	forceCritical bool   // true when destination is NANO: all protocols become critical
	addr          pp.AddrInfo
	dt            *tools.DataType
	user          string
	payload       []byte
	proto         protocol.ID
	retries       int // failed retry attempts so far; only incremented for non-critical entries
	addedAt       time.Time
}

// isEffectivelyCritical returns true when the entry must be retried indefinitely,
// either because its protocol is inherently critical or because the destination
// is a NANO peer (forceCritical).
func (e *DTNEntry) isEffectivelyCritical() bool { return DTNProtocols[e.proto] == DTNCritical || e.forceCritical } func (e *DTNEntry) toJSON() DTNEntryJSON { return DTNEntryJSON{ DID: e.did, ResourceID: e.resourceID, ForceCritical: e.forceCritical, Addr: e.addr, DT: e.dt, User: e.user, Payload: e.payload, Proto: e.proto, Retries: e.retries, AddedAt: e.addedAt, } } func entryFromJSON(j DTNEntryJSON) *DTNEntry { return &DTNEntry{ did: j.DID, resourceID: j.ResourceID, forceCritical: j.ForceCritical, addr: j.Addr, dt: j.DT, user: j.User, payload: j.Payload, proto: j.Proto, retries: j.Retries, addedAt: j.AddedAt, } } type DTNCache struct { mu sync.Mutex entries []*DTNEntry // aesKey is the derived AES-256 key used for on-disk encryption. // Nil when key derivation failed: persistence is disabled but the in-memory // cache continues to function normally. aesKey []byte } // newDNTCache initialises the cache, derives the encryption key, and restores // any critical entries that were persisted before the last crash. func newDNTCache() *DTNCache { log := oclib.GetLogger() c := &DTNCache{} key, err := deriveDNTKey() if err != nil { log.Warn().Err(err).Msg("[dnt] key derivation failed — persistence disabled") } else { c.aesKey = key c.loadFromDisk() } return c } // extractResourceID returns the "id" field from a JSON resource payload. // Returns "" when the payload is not a resource object (planner, config, etc.). func extractResourceID(payload []byte) string { var obj struct { ID string `json:"id"` } if err := json.Unmarshal(payload, &obj); err != nil { return "" } return obj.ID } // enqueue adds an entry to the cache, respecting the resource lifecycle. // Deduplication key is (did, resourceID): same resource to the same peer keeps // only the latest mutation. resourceID is empty for non-resource payloads // (planner, config), in which case deduplication falls back to did alone. // // - DELETE is terminal: any subsequent mutation on the same key is discarded. 
// - UPDATE cannot be followed by CREATE: the resource already exists remotely. // - All other cases replace the existing entry (newer mutation supersedes). func (c *DTNCache) enqueue(e *DTNEntry) { c.mu.Lock() found, mutated := false, false for i, existing := range c.entries { if existing.did != e.did || existing.resourceID != e.resourceID { continue } found = true if existing.proto == ProtocolDeleteResource || (existing.proto == ProtocolUpdateResource && e.proto == ProtocolCreateResource) { break // discard new entry silently — existing state is authoritative } c.entries[i] = e mutated = true break } if !found { c.entries = append(c.entries, e) mutated = true } c.mu.Unlock() if mutated && e.isEffectivelyCritical() { go c.persistToDisk() } } // peersWithPending returns the distinct peer IDs (did) that have at least one // critical entry in the cache. Used to populate Heartbeat.PendingContact. func (c *DTNCache) peersWithPending() []string { c.mu.Lock() defer c.mu.Unlock() seen := map[string]struct{}{} var out []string for _, e := range c.entries { if e.isEffectivelyCritical() { if _, ok := seen[e.did]; !ok { seen[e.did] = struct{}{} out = append(out, e.did) } } } return out } // drain atomically removes and returns all current entries. func (c *DTNCache) drain() []*DTNEntry { c.mu.Lock() defer c.mu.Unlock() out := c.entries c.entries = nil return out } // requeue puts entries back at the head of the list, preserving any new // entries added while the retry loop was running. func (c *DTNCache) requeue(entries []*DTNEntry) { if len(entries) == 0 { return } c.mu.Lock() defer c.mu.Unlock() c.entries = append(entries, c.entries...) } // ── Persistence ────────────────────────────────────────────────────────────── // DTNCachePath returns the path of the on-disk cache file, placed next to the // node's private key so it lives on the same persistent volume. 
func DTNCachePath() string {
	keyDir := filepath.Dir(conf.GetConfig().PrivateKeyPath)
	return filepath.Join(keyDir, "dnt_cache.bin")
}

// deriveDNTKey produces the 32-byte AES key used for the cache file by running
// HKDF-SHA256 over the node's private key. Nothing random goes into the
// derivation (nil salt, fixed info string), so a given private key always
// yields the same AES key and no symmetric secret has to be stored.
// Any failure to load or serialise the private key is returned unchanged.
func deriveDNTKey() ([]byte, error) {
	nodeKey, err := tools.LoadKeyFromFilePrivate()
	if err != nil {
		return nil, err
	}

	// Raw() on a libp2p Ed25519 private key returns the 64-byte representation
	// (32-byte seed || 32-byte public key). The full 64 bytes serve as IKM.
	ikm, err := nodeKey.Raw()
	if err != nil {
		return nil, err
	}

	// The info string binds the output to this specific use of the node key.
	kdf := hkdf.New(sha256.New, ikm, nil, []byte("oc-discovery/dnt-cache/v1"))
	out := make([]byte, 32)
	if _, err = io.ReadFull(kdf, out); err != nil {
		return nil, err
	}
	return out, nil
}

// persistToDisk encrypts all current critical entries and writes them to disk.
// Non-critical entries are deliberately excluded — they are not worth restoring
// after a restart given their limited retry budget.
func (c *DTNCache) persistToDisk() { if c.aesKey == nil { return } log := oclib.GetLogger() c.mu.Lock() var toSave []DTNEntryJSON for _, e := range c.entries { if e.isEffectivelyCritical() { toSave = append(toSave, e.toJSON()) } } c.mu.Unlock() plaintext, err := json.Marshal(toSave) if err != nil { return } block, err := aes.NewCipher(c.aesKey) if err != nil { return } gcm, err := cipher.NewGCM(block) if err != nil { return } nonce := make([]byte, gcm.NonceSize()) if _, err := io.ReadFull(rand.Reader, nonce); err != nil { return } ciphertext := gcm.Seal(nonce, nonce, plaintext, nil) path := DTNCachePath() tmp := path + ".tmp" if err := os.WriteFile(tmp, ciphertext, 0600); err != nil { log.Warn().Err(err).Msg("[dnt] failed to write cache file") return } if err := os.Rename(tmp, path); err != nil { log.Warn().Err(err).Msg("[dnt] failed to rename cache file") _ = os.Remove(tmp) } } // loadFromDisk decrypts the on-disk cache and re-enqueues only critical entries. // Errors (missing file, decryption failure) are non-fatal: the cache simply // starts empty, which is safe. 
func (c *DTNCache) loadFromDisk() { if c.aesKey == nil { return } log := oclib.GetLogger() path := DTNCachePath() data, err := os.ReadFile(path) if err != nil { if !os.IsNotExist(err) { log.Warn().Err(err).Msg("[dnt] failed to read cache file") } return } block, err := aes.NewCipher(c.aesKey) if err != nil { return } gcm, err := cipher.NewGCM(block) if err != nil { return } if len(data) < gcm.NonceSize() { log.Warn().Msg("[dnt] cache file too short, ignoring") return } nonce, ciphertext := data[:gcm.NonceSize()], data[gcm.NonceSize():] plaintext, err := gcm.Open(nil, nonce, ciphertext, nil) if err != nil { log.Warn().Err(err).Msg("[dnt] cache file decryption failed (key mismatch?), ignoring") return } var saved []DTNEntryJSON if err := json.Unmarshal(plaintext, &saved); err != nil { log.Warn().Err(err).Msg("[dnt] cache file unmarshal failed, ignoring") return } count := 0 for _, j := range saved { // Only restore critical entries — moderate entries are intentionally // not persisted, but this guard defends against format changes. e := entryFromJSON(j) if !e.isEffectivelyCritical() { continue } c.entries = append(c.entries, e) count++ } if count > 0 { log.Info().Int("count", count).Msg("[dnt] restored critical entries from disk") } } // ── Retry loop ──────────────────────────────────────────────────────────────── // startDNTLoop runs the background retry goroutine. Call once after init. func (s *StreamService) startDNTLoop() { logger := oclib.GetLogger() ticker := time.NewTicker(DTNRetryInterval) defer ticker.Stop() // retryEntries attempts delivery for the given entries and returns those // that must be kept for the next round. retryEntries := func(entries []*DTNEntry) []*DTNEntry { var keep []*DTNEntry for _, e := range entries { _, err := s.write(e.did, &e.addr, e.dt, e.user, e.payload, e.proto) if err == nil { if e.isEffectivelyCritical() { logger.Info().Str("proto", string(e.proto)).Str("peer", e.did). 
Msg("[dnt] critical message delivered after retry") } else { logger.Info().Str("proto", string(e.proto)).Str("peer", e.did). Int("retries", e.retries).Msg("[dnt] moderate message delivered after retry") } continue } if e.isEffectivelyCritical() { keep = append(keep, e) } else { e.retries++ if e.retries < DTNMaxModerateRetries { keep = append(keep, e) } else { logger.Warn().Str("proto", string(e.proto)).Str("peer", e.did). Int("retries", e.retries).Msg("[dnt] moderate message abandoned after max retries") } } } return keep } for { select { case <-ticker.C: entries := s.dnt.drain() if len(entries) == 0 { continue } s.dnt.requeue(retryEntries(entries)) go s.dnt.persistToDisk() case peerID := <-s.dntNudge: // A peer just signalled it is reachable — retry its entries immediately. entries := s.dnt.drain() var forPeer, other []*DTNEntry for _, e := range entries { if e.did == peerID { forPeer = append(forPeer, e) } else { other = append(other, e) } } kept := retryEntries(forPeer) s.dnt.requeue(append(kept, other...)) if len(kept) < len(forPeer) { go s.dnt.persistToDisk() } } } }