Discovery Neo Oclib
This commit is contained in:
+181
-61
@@ -27,7 +27,7 @@ const ProtocolObserve = "/opencloud/peer/observe/1.0"
|
||||
// observeHBEventType is used as the common.Event.Type for heartbeat responses.
const observeHBEventType = "/opencloud/peer/observe/heartbeat"

// observeHBInterval is the period at which the observed side emits heartbeats.
// It is also the unit used to estimate how many heartbeats should have been
// received when computing a peer's miss rate.
const observeHBInterval = 10 * time.Second

// observeDrainDuration is how long new incoming observations are rejected
// after a close-all.
const observeDrainDuration = 30 * time.Second
|
||||
|
||||
// observeBatchWindow is the accumulation window before a heartbeat batch is
|
||||
@@ -45,7 +45,95 @@ type ObserveRequest struct {
|
||||
|
||||
// ObserveHeartbeat is sent by the observed side every observeHBInterval.
type ObserveHeartbeat struct {
	// State is always "online" when actively emitted.
	State string `json:"state"`

	// SentAt is the timestamp set by the sender; it lets the receiver estimate
	// one-way latency. encoding/json's omitempty never omits struct values such
	// as time.Time, so a zero SentAt is still serialized: receivers must test
	// it with IsZero rather than rely on the key being absent.
	SentAt time.Time `json:"sent_at,omitempty"`
}
|
||||
|
||||
const (
|
||||
maxLatencyMs = 2000.0 // ms above which latency score → 0
|
||||
latencySamples = 5 // sliding window size for latency averaging
|
||||
fastThresholdMs = 200.0 // below = "fast", above = "slow"
|
||||
reliableThreshold = 0.95 // miss_rate below 5% = "reliable"
|
||||
)
|
||||
|
||||
// PeerObserveMetrics accumulates connection-quality data for one observed peer.
|
||||
// Updated on every incoming heartbeat (observing side).
|
||||
type PeerObserveMetrics struct {
|
||||
mu sync.Mutex
|
||||
firstObservedAt time.Time
|
||||
lastHeartbeatAt time.Time
|
||||
received uint64
|
||||
latencies [latencySamples]time.Duration
|
||||
latIdx int
|
||||
latCount int
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) record(latency time.Duration) {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
m.received++
|
||||
m.lastHeartbeatAt = time.Now().UTC()
|
||||
m.latencies[m.latIdx%latencySamples] = latency
|
||||
m.latIdx++
|
||||
if m.latCount < latencySamples {
|
||||
m.latCount++
|
||||
}
|
||||
}
|
||||
|
||||
func (m *PeerObserveMetrics) snapshot() PeerObserveSnapshot {
|
||||
m.mu.Lock()
|
||||
defer m.mu.Unlock()
|
||||
var total time.Duration
|
||||
for i := 0; i < m.latCount; i++ {
|
||||
total += m.latencies[i]
|
||||
}
|
||||
var avgMs float64
|
||||
if m.latCount > 0 {
|
||||
avgMs = float64(total.Milliseconds()) / float64(m.latCount)
|
||||
}
|
||||
expected := int64(time.Duration(m.lastHeartbeatAt.Second()-m.firstObservedAt.Second()) / observeHBInterval)
|
||||
fmt.Println("EXPECTED", expected, m.received)
|
||||
var missRate float64
|
||||
if expected > 0 {
|
||||
recv := int64(m.received)
|
||||
if recv > expected {
|
||||
recv = expected
|
||||
}
|
||||
missRate = 1.0 - float64(recv)/float64(expected)
|
||||
}
|
||||
latScore := 1.0 - avgMs/maxLatencyMs
|
||||
if latScore < 0 {
|
||||
latScore = 0
|
||||
}
|
||||
relScore := 1.0 - missRate
|
||||
trust := (0.35*latScore + 0.65*relScore) * 100
|
||||
|
||||
speed := "fast"
|
||||
if avgMs >= fastThresholdMs {
|
||||
speed = "slow"
|
||||
}
|
||||
reliability := "reliable"
|
||||
if relScore < reliableThreshold {
|
||||
reliability = "watch"
|
||||
}
|
||||
return PeerObserveSnapshot{
|
||||
LatencyMs: avgMs,
|
||||
Speed: speed,
|
||||
Reliability: reliability,
|
||||
TrustScore: trust,
|
||||
LastSeenAt: m.lastHeartbeatAt,
|
||||
MissRate: missRate,
|
||||
}
|
||||
}
|
||||
|
||||
// PeerObserveSnapshot is the point-in-time quality summary sent to oc-peer via NATS.
|
||||
type PeerObserveSnapshot struct {
|
||||
LatencyMs float64 `json:"latency_ms"`
|
||||
Speed string `json:"speed"` // "fast" | "slow"
|
||||
Reliability string `json:"reliability"` // "reliable" | "watch"
|
||||
TrustScore float64 `json:"trust_score"`
|
||||
LastSeenAt time.Time `json:"last_seen_at"`
|
||||
MissRate float64 `json:"miss_rate"`
|
||||
}
|
||||
|
||||
// ShallowPeer is the minimal peer representation sent by oc-peer in a
|
||||
@@ -204,18 +292,13 @@ func flushObserveBatch(peerIDs []string) {
|
||||
|
||||
// ── incoming observe handler (observed side) ──────────────────────────────────
|
||||
|
||||
// handleIncomingObserve is registered as the ProtocolObserve stream handler.
|
||||
// It is called when a remote peer opens an observe stream to us.
|
||||
// The function reads the request, validates it, then starts (or stops) the
|
||||
// heartbeat goroutine and returns immediately — the goroutine owns the stream.
|
||||
// handleIncomingObserve is called when a remote peer opens an observe stream
|
||||
// to us (observed side). It starts a heartbeat goroutine that writes back on
|
||||
// the same bidirectional rawStream — no separate reverse stream is opened.
|
||||
// The goroutine stops via context cancellation (triggered by a close event
|
||||
// read from rawStream) or when rawStream becomes unwritable.
|
||||
func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
remotePeerID := rawStream.Conn().RemotePeer().String()
|
||||
addr := rawStream.Conn().RemoteMultiaddr().String()
|
||||
ad, err := pp.AddrInfoFromString(addr + "/p2p/" + remotePeerID)
|
||||
if err != nil {
|
||||
fmt.Println("qndlqnl EERR", addr, err)
|
||||
return err
|
||||
}
|
||||
log := oclib.GetLogger()
|
||||
|
||||
// Drain mode: reject any new observations for 30 s after a close-all.
|
||||
@@ -223,13 +306,11 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
draining := !s.drainUntil.IsZero() && time.Now().Before(s.drainUntil)
|
||||
s.drainMu.RUnlock()
|
||||
if draining {
|
||||
rawStream.Close()
|
||||
fmt.Println("Draining")
|
||||
return errors.New("Draining")
|
||||
return errors.New("draining")
|
||||
}
|
||||
// Read the observe request (with a generous deadline to avoid hangs).
|
||||
// Guard: the requesting peer must not be blacklisted or be ourself.
|
||||
did := ""
|
||||
|
||||
// Guard: the requesting peer must not be blacklisted.
|
||||
access := oclib.NewRequestAdmin(oclib.LibDataEnum(oclib.PEER), nil)
|
||||
res := access.Search(&dbs.Filters{
|
||||
And: map[string][]dbs.Filter{
|
||||
@@ -238,11 +319,9 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
}, "", false, 0, 1)
|
||||
if len(res.Data) > 0 {
|
||||
p := res.Data[0].(*peer.Peer)
|
||||
did = p.GetID()
|
||||
if p.Relation == peer.BLACKLIST { // || p.Relation == peer.SELF
|
||||
rawStream.Close()
|
||||
if p.Relation == peer.BLACKLIST {
|
||||
fmt.Println("CLOSE blacklist or self")
|
||||
return errors.New("can't exploit blacklist or self")
|
||||
return errors.New("can't observe blacklisted peer")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,52 +330,32 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
s.observeCache.set(remotePeerID, cancel)
|
||||
fmt.Println("LOOP OBSERVE")
|
||||
go func() {
|
||||
defer rawStream.Close()
|
||||
// Do NOT close rawStream here: the persistent readLoop (HandleResponse)
|
||||
// owns rawStream's lifecycle. We only stop writing.
|
||||
defer cancel()
|
||||
defer s.observeCache.delete(remotePeerID)
|
||||
|
||||
ticker := time.NewTicker(observeHBInterval)
|
||||
defer ticker.Stop()
|
||||
|
||||
hbPayload, _ := json.Marshal(ObserveHeartbeat{State: "online"})
|
||||
evt := common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", hbPayload)
|
||||
if evt == nil {
|
||||
return
|
||||
}
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
buildHBEvent := func() *common.Event {
|
||||
p, _ := json.Marshal(ObserveHeartbeat{State: "online", SentAt: time.Now().UTC()})
|
||||
return common.NewEvent(observeHBEventType, s.Host.ID().String(), nil, "", p)
|
||||
}
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return
|
||||
case <-ticker.C:
|
||||
|
||||
rawStream.SetWriteDeadline(time.Now().Add(5 * time.Second))
|
||||
fmt.Println("LOOP EVT", evt)
|
||||
var err error
|
||||
if s.Streams, err = common.TempStream(s.Host, *ad, ProtocolObserve, did, s.Streams, protocols, &s.Mu); err == nil {
|
||||
stream := s.Streams[ProtocolObserve][ad.ID]
|
||||
if err := json.NewEncoder(stream.Stream).Encode(evt); err != nil {
|
||||
// Moderate connectivity event: the observer is unreachable.
|
||||
// The deferred calls above purge this observer from the cache.
|
||||
fmt.Println("LOOP EVT ERR", err)
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — moderate connectivity event, purging observer from cache")
|
||||
return
|
||||
}
|
||||
evt := buildHBEvent()
|
||||
if err := json.NewEncoder(rawStream).Encode(evt); err != nil {
|
||||
log.Info().
|
||||
Str("observer", remotePeerID).
|
||||
Err(err).
|
||||
Msg("[observe] heartbeat write failed — stream closed, stopping goroutine")
|
||||
return
|
||||
}
|
||||
rawStream.SetWriteDeadline(time.Time{})
|
||||
}
|
||||
@@ -308,14 +367,65 @@ func (s *StreamService) handleIncomingObserve(rawStream network.Stream) error {
|
||||
// ── heartbeat receiver (observing side) ───────────────────────────────────────
|
||||
|
||||
// handleObserveHeartbeat is called by readLoop when a heartbeat event arrives
|
||||
// on an outgoing ProtocolObserve stream. It queues the peer_id in the batch
|
||||
// accumulator; the batcher flushes to NATS after observeBatchWindow.
|
||||
// on an outgoing ProtocolObserve stream. It updates per-peer metrics and flushes
|
||||
// a quality snapshot to NATS.
|
||||
func (ps *StreamService) handleObserveHeartbeat(evt *common.Event) error {
|
||||
// ps.hbBatcher.add(evt.From)
|
||||
flushObserveBatch([]string{evt.From})
|
||||
var hb ObserveHeartbeat
|
||||
if err := json.Unmarshal(evt.Payload, &hb); err == nil && !hb.SentAt.IsZero() {
|
||||
latency := time.Since(hb.SentAt)
|
||||
raw, _ := ps.observeMetrics.LoadOrStore(evt.From, &PeerObserveMetrics{
|
||||
firstObservedAt: time.Now().UTC(),
|
||||
})
|
||||
raw.(*PeerObserveMetrics).record(latency)
|
||||
fmt.Println("METRICS", raw)
|
||||
ps.observeMetrics.Store(evt.From, raw)
|
||||
}
|
||||
ps.flushObserveForPeer(evt.From, evt.User)
|
||||
return nil
|
||||
}
|
||||
|
||||
// flushObserveForPeer sends a PEER_OBSERVE_RESPONSE_EVENT to NATS with a quality
|
||||
// snapshot for peerID. Replaces the old flushObserveBatch (single-peer variant).
|
||||
func (ps *StreamService) flushObserveForPeer(peerID string, user string) {
|
||||
var snap *PeerObserveSnapshot
|
||||
if raw, ok := ps.observeMetrics.Load(peerID); ok {
|
||||
fmt.Println("RETRIEVED METRICS", raw)
|
||||
s := raw.(*PeerObserveMetrics).snapshot()
|
||||
snap = &s
|
||||
}
|
||||
fmt.Println("RETRIEVED METRICS 2", snap)
|
||||
payload, err := json.Marshal(map[string]interface{}{
|
||||
"peer_ids": []string{peerID},
|
||||
"state": "online",
|
||||
"metrics": map[string]*PeerObserveSnapshot{peerID: snap},
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PEER_OBSERVE_RESPONSE_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PEER_OBSERVE_RESPONSE_EVENT),
|
||||
Payload: payload,
|
||||
})
|
||||
propPayload, err := json.Marshal(tools.PropalgationMessage{
|
||||
DataType: int(tools.PEER),
|
||||
Action: tools.PB_PROPAGATE,
|
||||
Payload: payload,
|
||||
})
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
tools.NewNATSCaller().SetNATSPub(tools.PROPALGATION_EVENT, tools.NATSResponse{
|
||||
FromApp: "oc-discovery",
|
||||
Datatype: tools.PEER,
|
||||
User: user,
|
||||
Method: int(tools.PROPALGATION_EVENT),
|
||||
Payload: propPayload,
|
||||
})
|
||||
}
|
||||
|
||||
// ── user→peer index (ref-counted observe management) ─────────────────────────
|
||||
|
||||
// userPeerIndex tracks which users are observing which peers.
|
||||
@@ -514,7 +624,8 @@ func (ps *StreamService) openObserveStream(p ShallowPeer) error {
|
||||
}
|
||||
|
||||
// closeObserveStream closes the ProtocolObserve stream to toPeerID and notifies
|
||||
// the remote side.
|
||||
// the remote side. The close event is wrapped in a common.Event so the remote's
|
||||
// persistent readLoop can decode and handle it (cancel the heartbeat goroutine).
|
||||
func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
decodedID, err := pp.Decode(toPeerID)
|
||||
if err != nil {
|
||||
@@ -523,12 +634,15 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
ps.Mu.Lock()
|
||||
if ps.Streams[ProtocolObserve] != nil {
|
||||
if s, ok := ps.Streams[ProtocolObserve][decodedID]; ok {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
delete(ps.Streams[ProtocolObserve], decodedID)
|
||||
}
|
||||
}
|
||||
ps.Mu.Unlock()
|
||||
ps.observeMetrics.Delete(toPeerID)
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -537,7 +651,9 @@ func (ps *StreamService) closeObserveStream(toPeerID string) error {
|
||||
func (ps *StreamService) CloseAllObserves() {
|
||||
ps.Mu.Lock()
|
||||
for _, s := range ps.Streams[ProtocolObserve] {
|
||||
_ = json.NewEncoder(s.Stream).Encode(ObserveRequest{Close: true})
|
||||
closePayload, _ := json.Marshal(ObserveRequest{Close: true})
|
||||
closeEvt := common.NewEvent(ProtocolObserve, ps.Host.ID().String(), nil, "", closePayload)
|
||||
_ = json.NewEncoder(s.Stream).Encode(closeEvt)
|
||||
s.Stream.Close()
|
||||
}
|
||||
delete(ps.Streams, ProtocolObserve)
|
||||
@@ -545,6 +661,10 @@ func (ps *StreamService) CloseAllObserves() {
|
||||
|
||||
// Reset user index so stale ref-counts don't block future opens.
|
||||
ps.observeUsers = newUserPeerIndex()
|
||||
ps.observeMetrics.Range(func(k, _ any) bool {
|
||||
ps.observeMetrics.Delete(k)
|
||||
return true
|
||||
})
|
||||
|
||||
ps.drainMu.Lock()
|
||||
ps.drainUntil = time.Now().Add(observeDrainDuration)
|
||||
|
||||
Reference in New Issue
Block a user