Discovery Neo Oclib

This commit is contained in:
mr
2026-05-27 16:17:00 +02:00
parent 7f951afd41
commit 6ce6e6fe7d
20 changed files with 1436 additions and 1133 deletions
+181 -61
View File
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"
const observeHBInterval = 30 * time.Second
const observeHBInterval = 10 * time.Second
const observeDrainDuration = 30 * time.Second
// observeBatchWindow is the accumulation window before a heartbeat batch is
@@ -45,7 +45,95 @@ type ObserveRequest struct {
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
State string `json:"state"` // always "online" when actively emitted
State string `json:"state"` // always "online" when actively emitted
SentAt time.Time `json:"sent_at,omitempty"` // timestamp set by sender; lets receiver compute one-way latency
}
const (
maxLatencyMs = 2000.0 // ms above which latency score → 0
latencySamples = 5 // sliding window size for latency averaging
fastThresholdMs = 200.0 // below = "fast", above = "slow"
reliableThreshold = 0.95 // miss_rate below 5% = "reliable"
)
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
// Updated on every incoming heartbeat (observing side).
type PeerObserveMetrics struct {
mu sync.Mutex
firstObservedAt time.Time
lastHeartbeatAt time.Time
received uint64
latencies [latencySamples]time.Duration
latIdx int
latCount int
}
func (m *PeerObserveMetrics) record(latency time.Duration) {
m.mu.Lock()
defer m.mu.Unlock()
m.received++
m.lastHeartbeatAt = time.Now().UTC()
m.latencies[m.latIdx%latencySamples] = latency
m.latIdx++
if m.latCount < latencySamples {
m.latCount++
}
}
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
m.mu.Lock()
defer m.mu.Unlock()
var total time.Duration
for i := 0; i < m.latCount; i++ {
total += m.latencies[i]
}
var avgMs float64
if m.latCount > 0 {
avgMs = float64(total.Milliseconds()) / float64(m.latCount)
}
expected := int64(time.Duration(m.lastHeartbeatAt.Second()-m.firstObservedAt.Second()) / observeHBInterval)
fmt.Println("EXPECTED", expected, m.received)
var missRate float64
if expected > 0 {
recv := int64(m.received)
if recv > expected {
recv = expected
}
missRate = 1.0 - float64(recv)/float64(expected)
}
latScore := 1.0 - avgMs/maxLatencyMs
if latScore < 0 {
latScore = 0
}
relScore := 1.0 - missRate
trust := (0.35*latScore + 0.65*relScore) * 100
speed := "fast"
if avgMs >= fastThresholdMs {
speed = "slow"
}
reliability := "reliable"
if relScore < reliableThreshold {
reliability = "watch"
}
return PeerObserveSnapshot{
LatencyMs: avgMs,
Speed: speed,
Reliability: reliability,
TrustScore: trust,
LastSeenAt: m.lastHeartbeatAt,
MissRate: missRate,
}
}
// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
type PeerObserveSnapshot struct {
LatencyMs float64 `json:"latency_ms"`
Speed string `json:"speed"` // "fast" | "slow"
Reliability string `json:"reliability"` // "reliable" | "watch"
TrustScore float64 `json:"trust_score"`
LastSeenAt time.Time `json:"last_seen_at"`
MissRate float64 `json:"miss_rate"`
}
// ShallowPeer is the minimal peer representation sent by oc-peer in a
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
// ── incoming observe handler (observed side) ──────────────────────────────────
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
// It is called when a remote peer opens an observe stream to us.
// The function reads the request, validates it, then starts (or stops) the
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
// handleIncomingObserve is called when a remote peer opens an observe stream
// to us (observed side). It starts a heartbeat goroutine that writes back on
// the same bidirectional rawStream — no separate reverse stream is opened.
// The goroutine stops via context cancellation (triggered by a close event
// read from rawStream) or when rawStream becomes unwritable.
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
remotePeerID := rawStream.Conn().RemotePeer().String()
addr := rawStream.Conn().RemoteMultiaddr().String()
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
if err != nil {
fmt.Println("qndlqnl EERR", addr, err)
return err
}
log := oclib.GetLogger()
// Drain mode: reject any new observations for 30 s after a close-all.
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
s.drainMu.RUnlock()
if draining {
rawStream.Close()
fmt.Println("Draining")
return errors.New("Draining")
return errors.New("draining")
}
// Read the observe request (with a generous deadline to avoid hangs).
// Guard: the requesting peer must not be blacklisted or be ourself.
did := ""
// Guard: the requesting peer must not be blacklisted.
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
res := access.Search(&dbs.Filters{
And: map[string][]dbs.Filter{
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
}, "", false, 0, 1)
if len(res.Data) > 0 {
p := res.Data[0].(*peer.Peer)
did = p.GetID()
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
rawStream.Close()
if p.Relation == peer.BLACKLIST {
fmt.Println("CLOSE blacklist or self")
return errors.New("can't exploit blacklist or self")
return errors.New("can't observe blacklisted peer")
}
}
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
s.observeCache.set(remotePeerID, cancel)
fmt.Println("LOOP OBSERVE")
go func() {
defer rawStream.Close()
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
// owns rawStream's lifecycle. We only stop writing.
defer cancel()
defer s.observeCache.delete(remotePeerID)
ticker := time.NewTicker(observeHBInterval)
defer ticker.Stop()
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
if evt == nil {
return
}
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
buildHBEvent := func() *common.Event {
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
}
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
fmt.Println("LOOP EVT", evt)
var err error
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
stream := s.Streams[ProtocolObserve][ad.ID]
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
// Moderate connectivity event: the observer is unreachable.
// The deferred calls above purge this observer from the cache.
fmt.Println("LOOP EVT ERR", err)
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
return
}
evt := buildHBEvent()
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
log.Info().
Str("observer", remotePeerID).
Err(err).
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
return
}
rawStream.SetWriteDeadline(time.Time{})
}
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
// ── heartbeat receiver (observing side) ───────────────────────────────────────
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
// accumulator; the batcher flushes to NATS after observeBatchWindow.
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
// a quality snapshot to NATS.
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
// ps.hbBatcher.add(evt.From)
flushObserveBatch([]string{evt.From})
var hb ObserveHeartbeat
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
latency := time.Since(hb.SentAt)
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
firstObservedAt: time.Now().UTC(),
})
raw.(*PeerObserveMetrics).record(latency)
fmt.Println("METRICS", raw)
ps.observeMetrics.Store(evt.From, raw)
}
ps.flushObserveForPeer(evt.From, evt.User)
return nil
}
// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
var snap *PeerObserveSnapshot
if raw, ok := ps.observeMetrics.Load(peerID); ok {
fmt.Println("RETRIEVED METRICS", raw)
s := raw.(*PeerObserveMetrics).snapshot()
snap = &s
}
fmt.Println("RETRIEVED METRICS 2", snap)
payload, err := json.Marshal(map[string]interface{}{
"peer_ids": []string{peerID},
"state": "online",
"metrics": map[string]*PeerObserveSnapshot{peerID: snap},
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
User: user,
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
Payload: payload,
})
propPayload, err := json.Marshal(tools.PropalgationMessage{
DataType: int(tools.PEER),
Action: tools.PB_PROPAGATE,
Payload: payload,
})
if err != nil {
return
}
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
FromApp: "oc-discovery",
Datatype: tools.PEER,
User: user,
Method: int(tools.PROPALGATION_EVENT),
Payload: propPayload,
})
}
// ── user→peer index (ref-counted observe management) ─────────────────────────
// userPeerIndex tracks which users are observing which peers.
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
}
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
// the remote side.
// the remote side. The close event is wrapped in a common.Event so the remote's
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
func (ps *StreamService) closeObserveStream(toPeerID string) error {
decodedID, err := pp.Decode(toPeerID)
if err != nil {
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
ps.Mu.Lock()
if ps.Streams[ProtocolObserve] != nil {
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
delete(ps.Streams[ProtocolObserve], decodedID)
}
}
ps.Mu.Unlock()
ps.observeMetrics.Delete(toPeerID)
return nil
}
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
func (ps *StreamService) CloseAllObserves() {
ps.Mu.Lock()
for _, s := range ps.Streams[ProtocolObserve] {
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
s.Stream.Close()
}
delete(ps.Streams, ProtocolObserve)
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
// Reset user index so stale ref-counts don't block future opens.
ps.observeUsers = newUserPeerIndex()
ps.observeMetrics.Range(func(k, _ any) bool {
ps.observeMetrics.Delete(k)
return true
})
ps.drainMu.Lock()
ps.drainUntil = time.Now().Add(observeDrainDuration)