oc-discovery -> conf

This commit is contained in:
mr
2026-04-08 10:04:41 +02:00
parent 46dee0a6cb
commit 29b26d366e
21 changed files with 1934 additions and 119 deletions

View File

@@ -6,6 +6,7 @@ import (
"time"
"oc-discovery/conf"
"oc-discovery/daemons/node/common"
pp "github.com/libp2p/go-libp2p/core/peer"
)
@@ -13,62 +14,14 @@ import (
// ── defaults ──────────────────────────────────────────────────────────────────
const (
defaultMaxConnPerWindow = 20
defaultConnWindowSecs = 30
defaultMaxHBPerMinute = 5
defaultMaxPublishPerMin = 10
defaultMaxGetPerMin = 50
strikeThreshold = 3
banDuration = 10 * time.Minute
behaviorWindowDur = 60 * time.Second
defaultMaxHBPerMinute = 5
defaultMaxPublishPerMin = 10
defaultMaxGetPerMin = 50
strikeThreshold = 3
banDuration = 10 * time.Minute
behaviorWindowDur = 60 * time.Second
)
// cfgOr returns v when it is a positive (explicitly configured) value,
// otherwise the compiled-in default def.
func cfgOr(v, def int) int {
	if v <= 0 {
		return def
	}
	return v
}
// ── ConnectionRateGuard ───────────────────────────────────────────────────────
// ConnectionRateGuard limits the number of NEW incoming connections accepted
// within a sliding time window. It protects public indexers against coordinated
// registration floods (Sybil bursts).
type ConnectionRateGuard struct {
mu sync.Mutex // guards window; Allow may be called from concurrent stream handlers
window []time.Time // accept timestamps inside the current window, oldest first
maxInWindow int // max accepts per window (cfg.MaxConnPerWindow or default)
windowDur time.Duration // sliding-window length (cfg.ConnWindowSecs or default)
}
// newConnectionRateGuard builds a guard from the live configuration, falling
// back to the compiled-in defaults when a setting is unset (<= 0).
func newConnectionRateGuard() *ConnectionRateGuard {
	c := conf.GetConfig()
	g := &ConnectionRateGuard{}
	g.maxInWindow = cfgOr(c.MaxConnPerWindow, defaultMaxConnPerWindow)
	g.windowDur = time.Duration(cfgOr(c.ConnWindowSecs, defaultConnWindowSecs)) * time.Second
	return g
}
// Allow reports whether a new connection may be accepted right now.
// Expired timestamps are pruned on every call, so the window slice stays
// bounded at maxInWindow entries.
func (g *ConnectionRateGuard) Allow() bool {
	g.mu.Lock()
	defer g.mu.Unlock()

	now := time.Now()
	cutoff := now.Add(-g.windowDur)

	// Filter in place: keep only timestamps still inside the window.
	live := g.window[:0]
	for _, ts := range g.window {
		if !ts.Before(cutoff) {
			live = append(live, ts)
		}
	}
	g.window = live

	if len(g.window) >= g.maxInWindow {
		return false
	}
	g.window = append(g.window, now)
	return true
}
// ── per-node state ────────────────────────────────────────────────────────────
type nodeBehavior struct {
@@ -130,9 +83,9 @@ func newNodeBehaviorTracker() *NodeBehaviorTracker {
cfg := conf.GetConfig()
return &NodeBehaviorTracker{
nodes: make(map[pp.ID]*nodeBehavior),
maxHB: cfgOr(cfg.MaxHBPerMinute, defaultMaxHBPerMinute),
maxPub: cfgOr(cfg.MaxPublishPerMinute, defaultMaxPublishPerMin),
maxGet: cfgOr(cfg.MaxGetPerMinute, defaultMaxGetPerMin),
maxHB: common.CfgOr(cfg.MaxHBPerMinute, defaultMaxHBPerMinute),
maxPub: common.CfgOr(cfg.MaxPublishPerMinute, defaultMaxPublishPerMin),
maxGet: common.CfgOr(cfg.MaxGetPerMinute, defaultMaxGetPerMin),
}
}

View File

@@ -21,21 +21,37 @@ import (
lpp "github.com/libp2p/go-libp2p/core/peer"
)
// DefaultTTLSeconds is the default TTL for peer records when the publisher
// does not declare a custom TTL. Exported so the node package can reference it.
const DefaultTTLSeconds = 120
// maxTTLSeconds caps how far in the future a publisher can set their ExpiryDate.
const maxTTLSeconds = 86400 // 24h
// tombstoneTTL is how long a signed delete record stays alive in the DHT —
// long enough to propagate everywhere, short enough not to linger forever.
const tombstoneTTL = 10 * time.Minute
type PeerRecordPayload struct {
Name string `json:"name"`
DID string `json:"did"`
PubKey []byte `json:"pub_key"`
ExpiryDate time.Time `json:"expiry_date"`
// TTLSeconds is the publisher's declared lifetime for this record in seconds.
// 0 means "use the default (120 s)". Included in the signed payload so it
// cannot be altered by an intermediary.
TTLSeconds int `json:"ttl_seconds,omitempty"`
}
type PeerRecord struct {
PeerRecordPayload
PeerID string `json:"peer_id"`
APIUrl string `json:"api_url"`
StreamAddress string `json:"stream_address"`
NATSAddress string `json:"nats_address"`
WalletAddress string `json:"wallet_address"`
Signature []byte `json:"signature"`
PeerID string `json:"peer_id"`
APIUrl string `json:"api_url"`
StreamAddress string `json:"stream_address"`
NATSAddress string `json:"nats_address"`
WalletAddress string `json:"wallet_address"`
Location *pp.PeerLocation `json:"location,omitempty"`
Signature []byte `json:"signature"`
}
func (p *PeerRecord) Sign() error {
@@ -84,6 +100,7 @@ func (pr *PeerRecord) ExtractPeer(ourkey string, key string, pubKey crypto.PubKe
StreamAddress: pr.StreamAddress,
NATSAddress: pr.NATSAddress,
WalletAddress: pr.WalletAddress,
Location: pr.Location,
}
if time.Now().UTC().After(pr.ExpiryDate) {
return pp.SELF == p.Relation, nil, errors.New("peer " + key + " is offline")
@@ -91,6 +108,42 @@ func (pr *PeerRecord) ExtractPeer(ourkey string, key string, pubKey crypto.PubKe
return pp.SELF == p.Relation, p, nil
}
// TombstonePayload is the signed body of a delete request.
// Only the owner's private key can produce a valid signature over this payload.
type TombstonePayload struct {
DID string `json:"did"` // DID whose DHT record is being deleted
PeerID string `json:"peer_id"` // must match the libp2p peer sending the request
DeletedAt time.Time `json:"deleted_at"` // signing time; rejected if older than 5 min (replay guard)
}
// TombstoneRecord is stored in the DHT at /node/{DID} to signal that a peer
// has voluntarily left the network. The Tombstone bool field acts as a
// discriminator so validators can distinguish it from a live PeerRecord.
type TombstoneRecord struct {
TombstonePayload
PubKey []byte `json:"pub_key"` // marshalled public key used to verify Signature
Tombstone bool `json:"tombstone"` // discriminator; always true for a valid tombstone
Signature []byte `json:"signature"` // owner's signature over the JSON-encoded TombstonePayload
}
// Verify checks the tombstone's signature against its embedded public key and
// returns that key on success.
//
// Fixes over the previous version: the json.Marshal error is no longer
// silently discarded, and a transport-level error from pubKey.Verify is no
// longer conflated with a clean "signature does not match" result — both now
// surface as an error, but marshal failures keep their own message.
func (ts *TombstoneRecord) Verify() (crypto.PubKey, error) {
	pubKey, err := crypto.UnmarshalPublicKey(ts.PubKey)
	if err != nil {
		return nil, err
	}
	payload, err := json.Marshal(ts.TombstonePayload)
	if err != nil {
		return nil, err
	}
	ok, err := pubKey.Verify(payload, ts.Signature)
	if err != nil || !ok {
		return nil, errors.New("invalid tombstone signature")
	}
	return pubKey, nil
}
// isTombstone reports whether data decodes as a TombstoneRecord whose
// discriminator flag is set.
func isTombstone(data []byte) bool {
	var rec TombstoneRecord
	if err := json.Unmarshal(data, &rec); err != nil {
		return false
	}
	return rec.Tombstone
}
type GetValue struct {
Key string `json:"key"`
PeerID string `json:"peer_id,omitempty"`
@@ -147,9 +200,9 @@ func (ix *IndexerService) isPeerKnown(pid lpp.ID) bool {
return false
}
ctx2, cancel2 := context.WithTimeout(context.Background(), 3*time.Second)
_, err = ix.DHT.GetValue(ctx2, ix.genKey(string(did)))
val, err := ix.DHT.GetValue(ctx2, ix.genKey(string(did)))
cancel2()
return err == nil
return err == nil && !isTombstone(val)
}
func (ix *IndexerService) initNodeHandler() {
@@ -188,6 +241,18 @@ func (ix *IndexerService) initNodeHandler() {
logger.Warn().Err(err).Str("did", rec.DID).Msg("indexer: heartbeat record signature invalid")
return
}
// Don't republish if a tombstone was recently stored for this DID:
// the peer explicitly left and we must not re-animate their record.
ix.deletedDIDsMu.Lock()
if t, ok := ix.deletedDIDs[rec.DID]; ok {
if time.Since(t) < tombstoneTTL {
ix.deletedDIDsMu.Unlock()
return
}
// tombstoneTTL elapsed — peer is allowed to re-register.
delete(ix.deletedDIDs, rec.DID)
}
ix.deletedDIDsMu.Unlock()
// Keep StreamRecord.Record in sync so BuildHeartbeatResponse always
// sees a populated PeerRecord (Name, DID, etc.) regardless of whether
// handleNodePublish ran before or after the heartbeat stream was opened.
@@ -220,6 +285,8 @@ func (ix *IndexerService) initNodeHandler() {
ix.Host.SetStreamHandler(common.ProtocolHeartbeat, ix.HandleHeartbeat)
ix.Host.SetStreamHandler(common.ProtocolPublish, ix.handleNodePublish)
ix.Host.SetStreamHandler(common.ProtocolGet, ix.handleNodeGet)
ix.Host.SetStreamHandler(common.ProtocolDelete, ix.handleNodeDelete)
ix.Host.SetStreamHandler(common.ProtocolIndirectProbe, ix.handleIndirectProbe)
ix.Host.SetStreamHandler(common.ProtocolIndexerCandidates, ix.handleCandidateRequest)
ix.initSearchHandlers()
}
@@ -383,12 +450,12 @@ func (ix *IndexerService) handleNodeGet(s network.Stream) {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
c, err := ix.DHT.GetValue(ctx, ix.genKey(key))
cancel()
if err == nil {
if err == nil && !isTombstone(c) {
var rec PeerRecord
if json.Unmarshal(c, &rec) == nil {
resp.Records[rec.PeerID] = rec
}
} else {
} else if err != nil {
logger.Err(err).Msg("Failed to fetch PeerRecord from DHT " + key)
}
}
@@ -399,3 +466,121 @@ func (ix *IndexerService) handleNodeGet(s network.Stream) {
}
}
// handleNodeDelete processes a signed delete (tombstone) request from a peer.
// It verifies that the request is:
//   - marked as a tombstone with both DID and PeerID populated
//   - recent (within 5 minutes) AND not future-dated — preventing replay attacks
//     in both directions
//   - sent by the actual peer whose record is being deleted (PeerID == remotePeer)
//   - signed by the matching private key
//
// On success it stores the tombstone in the DHT, evicts the peer from the local
// stream records, and marks the DID in deletedDIDs so AfterHeartbeat cannot
// accidentally republish the record during the tombstoneTTL window.
func (ix *IndexerService) handleNodeDelete(s network.Stream) {
	defer s.Close()
	logger := oclib.GetLogger()
	remotePeer := s.Conn().RemotePeer()
	s.SetDeadline(time.Now().Add(10 * time.Second))
	var ts TombstoneRecord
	if err := json.NewDecoder(s).Decode(&ts); err != nil || !ts.Tombstone {
		s.Reset()
		return
	}
	if ts.PeerID == "" || ts.DID == "" {
		s.Reset()
		return
	}
	// Reject stale AND future-dated tombstones. The previous check only caught
	// DeletedAt older than 5 minutes; a future DeletedAt produced a negative
	// time.Since, slipped through, and then kept the DID locked in deletedDIDs
	// far beyond tombstoneTTL (the heartbeat path tests time.Since(t) < TTL).
	// One minute of forward tolerance absorbs clock skew.
	if age := time.Since(ts.DeletedAt); age > 5*time.Minute || age < -time.Minute {
		logger.Warn().Str("peer", remotePeer.String()).Msg("[delete] stale tombstone rejected")
		s.Reset()
		return
	}
	if ts.PeerID != remotePeer.String() {
		logger.Warn().Str("peer", remotePeer.String()).Msg("[delete] tombstone PeerID mismatch")
		s.Reset()
		return
	}
	if _, err := ts.Verify(); err != nil {
		logger.Warn().Err(err).Str("peer", remotePeer.String()).Msg("[delete] invalid tombstone signature")
		s.Reset()
		return
	}
	// Mark DID as deleted in-memory before writing to DHT so AfterHeartbeat
	// cannot win a race and republish the live record on top of the tombstone.
	ix.deletedDIDsMu.Lock()
	ix.deletedDIDs[ts.DID] = ts.DeletedAt
	ix.deletedDIDsMu.Unlock()
	data, _ := json.Marshal(ts)
	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	if err := ix.DHT.PutValue(ctx, ix.genKey(ts.DID), data); err != nil {
		logger.Warn().Err(err).Str("did", ts.DID).Msg("[delete] DHT write tombstone failed")
	}
	cancel()
	// Invalidate the /pid/ secondary index so isPeerKnown returns false quickly.
	ctx2, cancel2 := context.WithTimeout(context.Background(), 10*time.Second)
	if err := ix.DHT.PutValue(ctx2, ix.genPIDKey(ts.PeerID), []byte("")); err != nil {
		logger.Warn().Err(err).Str("pid", ts.PeerID).Msg("[delete] DHT clear pid failed")
	}
	cancel2()
	// Evict from active stream records.
	if pid, err := lpp.Decode(ts.PeerID); err == nil {
		ix.StreamMU.Lock()
		delete(ix.StreamRecords[common.ProtocolHeartbeat], pid)
		ix.StreamMU.Unlock()
	}
	logger.Info().Str("did", ts.DID).Str("peer", ts.PeerID).Msg("[delete] tombstone stored, peer evicted")
}
// handleIndirectProbe is the SWIM inter-indexer probe handler.
// A node opens this stream toward a live indexer to ask: "can you reach peer X?"
// The indexer attempts a ProtocolBandwidthProbe to X and reports back.
// This is the only protocol that indexers use to communicate with each other;
// no persistent inter-indexer connections are maintained.
func (ix *IndexerService) handleIndirectProbe(s network.Stream) {
	defer s.Close()
	s.SetDeadline(time.Now().Add(10 * time.Second))

	var probe common.IndirectProbeRequest
	if json.NewDecoder(s).Decode(&probe) != nil {
		s.Reset()
		return
	}

	reply := func(ok bool, ms int64) {
		json.NewEncoder(s).Encode(common.IndirectProbeResponse{
			Reachable: ok,
			LatencyMs: ms,
		})
	}

	ctx, cancel := context.WithTimeout(context.Background(), 6*time.Second)
	defer cancel()

	// Dial the target first if no connection exists yet.
	if ix.Host.Network().Connectedness(probe.Target.ID) != network.Connected {
		if ix.Host.Connect(ctx, probe.Target) != nil {
			reply(false, 0)
			return
		}
	}

	// Time a round trip over a bandwidth-probe stream, which every node and
	// indexer already registers.
	began := time.Now()
	probeStream, err := ix.Host.NewStream(ctx, probe.Target.ID, common.ProtocolBandwidthProbe)
	if err != nil {
		reply(false, 0)
		return
	}
	defer probeStream.Reset()
	probeStream.SetDeadline(time.Now().Add(3 * time.Second))
	probeStream.Write([]byte("ping"))
	resp := make([]byte, 4)
	_, err = probeStream.Read(resp)
	reply(err == nil, time.Since(began).Milliseconds())
}

View File

@@ -9,6 +9,7 @@ import (
"oc-discovery/daemons/node/common"
"strings"
"sync"
"sync/atomic"
"time"
oclib "cloud.o-forge.io/core/oc-lib"
@@ -30,9 +31,9 @@ type dhtCacheEntry struct {
// SuggestMigrate to a small batch at a time; peers that don't migrate within
// offloadGracePeriod are moved to alreadyTried so a new batch can be picked.
type offloadState struct {
inBatch map[pp.ID]time.Time // peer → time added to current batch
inBatch map[pp.ID]time.Time // peer → time added to current batch
alreadyTried map[pp.ID]struct{} // peers proposed to that didn't migrate
mu sync.Mutex
mu sync.Mutex
}
const (
@@ -64,9 +65,20 @@ type IndexerService struct {
pendingSearchesMu sync.Mutex
// behavior tracks per-node compliance (heartbeat rate, publish/get volume,
// identity consistency, signature failures).
behavior *NodeBehaviorTracker
behavior *NodeBehaviorTracker
// connGuard limits new-connection bursts to protect public indexers.
connGuard *ConnectionRateGuard
// deletedDIDs tracks recently tombstoned DIDs to prevent AfterHeartbeat
// from republishing records that were explicitly deleted by the peer.
// Entries are cleared automatically after tombstoneTTL.
deletedDIDs map[string]time.Time
deletedDIDsMu sync.RWMutex
// SWIM incarnation: incremented when a connecting node signals suspicion via
// SuspectedIncarnation. The new value is broadcast back so nodes can clear
// their suspect state (refutation mechanism).
incarnation atomic.Uint64
// eventQueue holds SWIM membership events to be piggybacked on responses
// (infection-style dissemination toward connected nodes).
eventQueue *common.MembershipEventQueue
}
// NewIndexerService creates an IndexerService.
@@ -81,7 +93,8 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
referencedNodes: map[pp.ID]PeerRecord{},
pendingSearches: map[string]chan []common.SearchHit{},
behavior: newNodeBehaviorTracker(),
connGuard: newConnectionRateGuard(),
deletedDIDs: make(map[string]time.Time),
eventQueue: &common.MembershipEventQueue{},
}
if ps == nil {
ps, err = pubsub.NewGossipSub(context.Background(), ix.Host)
@@ -96,6 +109,21 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
common.ConnectToIndexers(h, conf.GetConfig().MinIndexer, conf.GetConfig().MaxIndexer*2)
logger.Info().Msg("subscribe to decentralized search flow as strict indexer...")
go ix.SubscribeToSearch(ix.PS, nil)
ix.AllowInbound = func(remotePeer pp.ID, isNew bool) error {
/*if ix.behavior.IsBanned(remotePeer) {
return errors.New("peer is banned")
}*/
if isNew {
// DB blacklist check: blocks reconnection after EvictPeer + blacklist.
/*if !ix.isPeerKnown(remotePeer) {
return errors.New("peer is blacklisted or unknown")
}*/
if !ix.ConnGuard.Allow() {
return errors.New("connection rate limit exceeded, retry later")
}
}
return nil
}
}
ix.LongLivedStreamRecordedService.AfterDelete = func(pid pp.ID, name, did string) {
@@ -106,16 +134,7 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
// AllowInbound: fired once per stream open, before any heartbeat is decoded.
// 1. Reject peers that are currently banned (behavioral strikes).
// 2. For genuinely new connections, apply the burst guard.
ix.AllowInbound = func(remotePeer pp.ID, isNew bool) error {
if ix.behavior.IsBanned(remotePeer) {
return errors.New("peer is banned")
}
if isNew && !ix.connGuard.Allow() {
return errors.New("connection rate limit exceeded, retry later")
}
return nil
}
// 2. For genuinely new connections, check the DB blacklist and apply the burst guard.
// ValidateHeartbeat: fired on every heartbeat tick for an established stream.
// Checks heartbeat cadence — rejects if the node is sending too fast.
@@ -162,7 +181,11 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
// Build and send a HeartbeatResponse after each received node heartbeat.
// Raw metrics only — no pre-cooked score. Node computes the score itself.
ix.BuildHeartbeatResponse = func(remotePeer pp.ID, need int, challenges []string, challengeDID string, referent bool, rawRecord json.RawMessage) *common.HeartbeatResponse {
ix.BuildHeartbeatResponse = func(remotePeer pp.ID, hb *common.Heartbeat) *common.HeartbeatResponse {
logger := oclib.GetLogger()
need, challenges, challengeDID, referent, rawRecord :=
hb.Need, hb.Challenges, hb.ChallengeDID, hb.Referent, hb.Record
ix.StreamMU.RLock()
peerCount := len(ix.StreamRecords[common.ProtocolHeartbeat])
// Collect lastSeen per active peer for challenge responses.
@@ -197,6 +220,31 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
// Update referent designation: node marks its best-scored indexer with Referent=true.
ix.updateReferent(remotePeer, remotePeerRecord, referent)
// SWIM refutation: if the node signals our current incarnation as suspected,
// increment it and broadcast an alive event so other nodes can clear suspicion.
inc := ix.incarnation.Load()
if hb.SuspectedIncarnation != nil && *hb.SuspectedIncarnation == inc {
inc = ix.incarnation.Add(1)
logger.Info().
Str("suspected_by", remotePeer.String()).
Uint64("new_incarnation", inc).
Msg("[swim] refuting suspicion — incarnation incremented")
ix.eventQueue.Add(common.MemberEvent{
Type: common.MemberAlive,
PeerID: ix.Host.ID().String(),
Incarnation: inc,
HopsLeft: common.InitialEventHops,
})
}
// Relay incoming SWIM events from the node into our event queue so they
// propagate to other connected nodes (infection-style forwarding).
for _, ev := range hb.MembershipEvents {
if ev.HopsLeft > 0 {
ix.eventQueue.Add(ev)
}
}
maxN := ix.MaxNodesConn()
fillRate := 0.0
if maxN > 0 {
@@ -356,6 +404,10 @@ func NewIndexerService(h host.Host, ps *pubsub.PubSub, maxNode int) *IndexerServ
}()
}
// Attach SWIM incarnation and piggybacked membership events.
resp.Incarnation = ix.incarnation.Load()
resp.MembershipEvents = ix.eventQueue.Drain(5)
return resp
}
@@ -489,6 +541,23 @@ func (ix *IndexerService) startDHTProvide(fillRateFn func() float64) {
}()
}
// EvictPeer immediately closes the heartbeat stream of a peer and removes it
// from the active stream records. Used when a peer is auto-blacklisted.
// Invalid peer IDs and unknown peers are silently ignored.
func (ix *IndexerService) EvictPeer(peerID string) {
	pid, err := pp.Decode(peerID)
	if err != nil {
		return
	}
	ix.StreamMU.Lock()
	defer ix.StreamMU.Unlock()
	rec, ok := ix.StreamRecords[common.ProtocolHeartbeat][pid]
	if !ok {
		return
	}
	if hb := rec.HeartbeatStream; hb != nil && hb.Stream != nil {
		hb.Stream.Reset()
	}
	delete(ix.StreamRecords[common.ProtocolHeartbeat], pid)
}
func (ix *IndexerService) Close() {
if ix.dhtProvideCancel != nil {
ix.dhtProvideCancel()

View File

@@ -19,6 +19,21 @@ func (v DefaultValidator) Select(key string, values [][]byte) (int, error) {
type PeerRecordValidator struct{}
func (v PeerRecordValidator) Validate(key string, value []byte) error {
// Accept valid tombstones — deletion must be storable so it can propagate
// and win over stale live records on other DHT nodes via Select().
var ts TombstoneRecord
if err := json.Unmarshal(value, &ts); err == nil && ts.Tombstone {
if ts.PeerID == "" || ts.DID == "" {
return errors.New("tombstone: missing fields")
}
if time.Since(ts.DeletedAt) > tombstoneTTL {
return errors.New("tombstone: expired")
}
if _, err := ts.Verify(); err != nil {
return errors.New("tombstone: " + err.Error())
}
return nil
}
var rec PeerRecord
if err := json.Unmarshal(value, &rec); err != nil {
@@ -35,6 +50,12 @@ func (v PeerRecordValidator) Validate(key string, value []byte) error {
return errors.New("record expired")
}
// TTL cap: publisher cannot set an expiry further than maxTTLSeconds in
// the future. Prevents abuse (e.g. records designed to linger for years).
if rec.ExpiryDate.After(time.Now().UTC().Add(maxTTLSeconds * time.Second)) {
return errors.New("TTL exceeds maximum allowed")
}
// Signature verification
if _, err := rec.Verify(); err != nil {
return errors.New("invalid signature")
@@ -44,6 +65,14 @@ func (v PeerRecordValidator) Validate(key string, value []byte) error {
}
func (v PeerRecordValidator) Select(key string, values [][]byte) (int, error) {
// Tombstone always wins: a signed delete supersedes any live record,
// even if the live record has a later ExpiryDate.
for i, val := range values {
var ts TombstoneRecord
if err := json.Unmarshal(val, &ts); err == nil && ts.Tombstone {
return i, nil
}
}
var newest time.Time
index := 0