From a10021fb982098e7732ea93a5ce50fb330258c43 Mon Sep 17 00:00:00 2001 From: pb Date: Wed, 28 May 2025 11:30:22 +0200 Subject: [PATCH] added documentation for the RAM consumption tests we jsut did --- docs/performance_test/README.md | 135 ++++++++++++++++++++ docs/performance_test/insert_exec.sh | 71 ++++++++++ docs/performance_test/performance_report.md | 0 3 files changed, 206 insertions(+) create mode 100644 docs/performance_test/README.md create mode 100755 docs/performance_test/insert_exec.sh create mode 100644 docs/performance_test/performance_report.md diff --git a/docs/performance_test/README.md b/docs/performance_test/README.md new file mode 100644 index 0000000..bad5653 --- /dev/null +++ b/docs/performance_test/README.md @@ -0,0 +1,135 @@ +# Goals + +This originated from a request to know how much RAM is consumed by Open Cloud when running a large number of workflows at the same time on the same node. + +We differentiated between different components : + +- The "oc-stack", which is the minimum set of services to be able to create and schedule a workflow execution : oc-auth, oc-datacenter, oc-scheduler, oc-front, oc-schedulerd, oc-workflow, oc-catalog, oc-peer, oc-workspace, loki, mongo, traefik and nats + +- oc-monitord, which is the daemon instantiated by the scheduling daemon (oc-schedulerd) that creates the YAML for argo and creates the necessary kubernetes resources. + +We monitor both parts to view how much RAM the oc-stack uses before / during / after the execution, the RAM consumed by the monitord containers and the total of the stack and monitors. + +# Setup + +In order to have optimal performance we used a Proxmox server with high resources (>370 GiB RAM and 128 cores) to host two VMs composing our Kubernetes cluster, with one control plane node where the oc stack is running and a worker node with only k3s running. 
+ +## VMs + +We instantiated a 2-node Kubernetes (with k3s) cluster on the superg PVE (https://superg-pve.irtse-pf.ext:8006/) + +### VM Control + +This VM is running the oc stack and the monitord containers; it carries the biggest part of the load. It must have k3s and argo installed. We allocated **62 GiB of RAM** and **31 cores**. + +### VM Worker + +This VM is holding the workload for all the pods created, acting as a worker node for the k3s cluster. We deploy k3s as a node as explained in the K3S quick start guide : + +`curl -sfL https://get.k3s.io | K3S_URL=https://myserver:6443 K3S_TOKEN=mynodetoken sh -` + +The value to use for K3S_TOKEN is stored at `/var/lib/rancher/k3s/server/node-token` on the server node. + +Verify that the worker VM has been added as a node to the cluster by running `kubectl get nodes` on the control plane and looking for the hostname of the worker VM in the list of nodes. + +### Delegate pods to the worker node + +In order for the pods to be executed on another node we need to modify how we construct the Argo YAML, to add a node selector to the workflow spec. We have added the needed attributes to the `Spec` struct in `oc-monitord` on the `test-ram` branch. 
+ +```go +type Spec struct { + ServiceAccountName string `yaml:"serviceAccountName"` + Entrypoint string `yaml:"entrypoint"` + Arguments []Parameter `yaml:"arguments,omitempty"` + Volumes []VolumeClaimTemplate `yaml:"volumeClaimTemplates,omitempty"` + Templates []Template `yaml:"templates"` + Timeout int `yaml:"activeDeadlineSeconds,omitempty"` + NodeSelector struct{ + NodeRole string `yaml:"node-role"` + } `yaml:"nodeSelector"` +} +``` + +and added the tag in the `CreateDAG()` method : + +```go +b.Workflow.Spec.NodeSelector.NodeRole = "worker" +``` + +## Container monitoring + +Docker compose to instantiate the monitoring stack : +- Prometheus : storing data +- cAdvisor : monitoring of the containers + +```yml +version: '3.2' +services: + prometheus: + image: prom/prometheus:latest + container_name: prometheus + ports: + - 9090:9090 + command: + - --config.file=/etc/prometheus/prometheus.yml + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + depends_on: + - cadvisor + cadvisor: + image: gcr.io/cadvisor/cadvisor:latest + container_name: cadvisor + ports: + - 9999:8080 + volumes: + - /:/rootfs:ro + - /var/run:/var/run:rw + - /sys:/sys:ro + - /var/lib/docker/:/var/lib/docker:ro + +``` + +Prometheus scraping configuration : + +```yml +scrape_configs: +- job_name: cadvisor + scrape_interval: 5s + static_configs: + - targets: + - cadvisor:8080 +``` + +## Dashboards + +In order to monitor the resource consumption during our tests we need to create a dashboard in Grafana. + +We create 4 different queries using Prometheus as the data source. For each query we can use the `code` mode to create them from a PromQL query. 
+ +## OC stack consumption + +``` +sum(container_memory_usage_bytes{name=~"oc-auth|oc-datacenter|oc-scheduler|oc-front|oc-schedulerd|oc-workflow|oc-catalog|oc-peer|oc-workspace|loki|mongo|traefik|nats"}) +``` + +## Monitord consumption + +``` +sum(container_memory_usage_bytes{image="oc-monitord"}) +``` + +## Total RAM consumption + +``` +sum( + container_memory_usage_bytes{name=~"oc-auth|oc-datacenter|oc-scheduler|oc-front|oc-schedulerd|oc-workflow|oc-catalog|oc-peer|oc-workspace|loki|mongo|traefik|nats"} + or + container_memory_usage_bytes{image="oc-monitord"} +) +``` + +## Number of monitord containers + +``` +count(container_memory_usage_bytes{image="oc-monitord"} > 0) +``` \ No newline at end of file diff --git a/docs/performance_test/insert_exec.sh b/docs/performance_test/insert_exec.sh new file mode 100755 index 0000000..2a07c14 --- /dev/null +++ b/docs/performance_test/insert_exec.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +TOKEN="LCSg2svSFdIBPucRCCmz7UrDmZKmsvODXU5PM0yWdYc.-LPNAoycUm5Kwm1IcyyP-wJvc8Zy7er36ozpjTWICKc." 
# ---------------------------------------------------------------------------
# insert_exec.sh (body) - send a burst of scheduling requests in parallel.
#
# Usage: insert_exec.sh [NB_EXEC] [TIME]
#   NB_EXEC  number of executions wanted (default 1). ceil(NB_EXEC / 10)
#            requests are sent, each carrying the same cron expression.
#   TIME     delay, in minutes from now, of the scheduled start (default 1).
#
# Environment:
#   TOKEN          bearer token, assigned right after the shebang (above).
#   OC_TOKEN       optional; when set and non-empty it is used instead of
#                  TOKEN, so no secret has to be committed with the script.
#   SCHEDULER_URL  optional; overrides the default scheduler endpoint.
#
# Requires curl and a `date` that understands `-d @EPOCH` (GNU coreutils).
#
# Note: the collapsed patch line this block replaces also ended with the diff
# header of the empty file docs/performance_test/performance_report.md
# (new file mode 100644, index 0000000..e69de29).
# ---------------------------------------------------------------------------

# Default endpoint; the trailing UUID is presumably the id of the workflow
# being scheduled - TODO confirm.
: "${SCHEDULER_URL:=http://localhost:8000/scheduler/62c55a70-e295-45e6-9925-a98137d59abc}"

# Print a message on stderr and abort with status 2 (usage error).
die() {
  printf '%s\n' "$*" >&2
  exit 2
}

# Print the number of requests needed for $1 executions: ceil($1 / 10).
# 10# forces base 10 so that an argument such as "010" is not read as octal.
request_count() {
  echo $(( (10#$1 + 9) / 10 ))
}

# Print the number of days of month $2 (1-12) in year $1.
days_in_month() {
  local year=$((10#$1)) month=$((10#$2))
  case "$month" in
    4 | 6 | 9 | 11)
      echo 30
      ;;
    2)
      if (( (year % 4 == 0 && year % 100 != 0) || year % 400 == 0 )); then
        echo 29
      else
        echo 28
      fi
      ;;
    *)
      echo 31
      ;;
  esac
}

# Print, as YYYY-MM-DD, the date one calendar month after year $1, month $2,
# day $3. December rolls over to January of the next year and the day is
# clamped to the last day of the target month (Jan 31 -> Feb 28/29).
# Zero-padded input ("08", "09") is accepted.
one_month_later() {
  local year=$((10#$1)) month=$((10#$2)) day=$((10#$3))
  local last_day

  month=$((month + 1))
  if (( month > 12 )); then
    month=1
    year=$((year + 1))
  fi

  last_day=$(days_in_month "$year" "$month")
  if (( day > last_day )); then
    day=$last_day
  fi

  printf '%04d-%02d-%02d\n' "$year" "$month" "$day"
}

# Print the JSON body of a scheduling request.
#   $1 cron expression, $2 start timestamp, $3 end timestamp
build_payload() {
  printf '{"id":null,"name":null,"cron":"%s","mode":1,"start":"%s","end":"%s"}' \
    "$1" "$2" "$3"
}

# Send one scheduling request and report how long it took.
#   $1 request index, $2 cron expression, $3 JSON payload, $4 bearer token
# Outputs: progress lines, the server response and timings on stdout.
send_request() {
  local idx=$1 cron=$2 payload=$3 token=$4
  local start_req end duration

  start_req=$(date +%s)

  echo "Exec $idx"
  echo "$cron"

  curl -X 'POST' "$SCHEDULER_URL" \
    -H 'accept: application/json' \
    -H 'Content-Type: application/json' \
    -d "$payload" \
    -H "Authorization: Bearer $token" -w '\n'

  end=$(date +%s)
  duration=$((end - start_req))

  echo "Début $start_req"
  echo "Fin $end"
  echo "Durée d'exécution $idx : $duration secondes"
}

# Entry point: validate the arguments, build the schedule once, fan the
# requests out in the background and wait for all of them.
main() {
  local nb_exec=${1:-1}
  local delay_min=${2:-1}

  # A former (disabled) check rejected values that are not multiples of 10;
  # the count is rounded up instead, so only the format is validated here.
  [[ $nb_exec =~ ^[0-9]+$ ]] ||
    die "NB_EXEC must be a non-negative integer, got '$nb_exec'"
  [[ $delay_min =~ ^[0-9]+$ ]] ||
    die "TIME must be a non-negative number of minutes, got '$delay_min'"

  local execs
  execs=$(request_count "$nb_exec")
  echo "EXECS=$execs"

  # Every date field is derived from ONE shifted UTC timestamp, so the
  # minute, hour, day, month and year stay consistent when "now + TIME"
  # crosses an hour, day, month or year boundary.
  local now_epoch sched_epoch fields
  now_epoch=$(date -u +%s) || die "cannot read the current time"
  sched_epoch=$((now_epoch + 10#$delay_min * 60))
  fields=$(date -u -d "@${sched_epoch}" '+%Y %m %d %H %M') ||
    die "date -d @EPOCH failed (GNU date is required)"

  local year month day hour minute
  read -r year month day hour minute <<<"$fields"

  # NOTE(review): if the scheduler reads the first cron field as seconds with
  # inclusive bounds, 0-10 covers 11 values, not 10 - confirm whether 0-9
  # was intended.
  local cron="0-10 $minute $hour $day $month *"
  local start="${year}-${month}-${day}T${hour}:${minute}:00.012Z"
  local end_date
  end_date=$(one_month_later "$year" "$month" "$day")
  local end="${end_date}T${hour}:${minute}:00.012Z"

  local payload
  payload=$(build_payload "$cron" "$start" "$end")

  local token=${OC_TOKEN:-${TOKEN:-}}

  local start_loop end_loop total_time i
  start_loop=$(date +%s)

  for ((i = 1; i <= execs; i++)); do
    send_request "$i" "$cron" "$payload" "$token" &
  done

  # Barrier: the total duration covers the slowest request.
  wait

  end_loop=$(date +%s)
  total_time=$((end_loop - start_loop))
  echo "Durée d'exécution total : $total_time secondes"
}

main "$@"