From a10021fb982098e7732ea93a5ce50fb330258c43 Mon Sep 17 00:00:00 2001
From: pb
Date: Wed, 28 May 2025 11:30:22 +0200
Subject: [PATCH] added documentation for the RAM consumption tests we just did

---
 docs/performance_test/README.md             | 135 ++++++++++++++++++++
 docs/performance_test/insert_exec.sh        |  71 ++++++++++
 docs/performance_test/performance_report.md |   0
 3 files changed, 206 insertions(+)
 create mode 100644 docs/performance_test/README.md
 create mode 100755 docs/performance_test/insert_exec.sh
 create mode 100644 docs/performance_test/performance_report.md

diff --git a/docs/performance_test/README.md b/docs/performance_test/README.md
new file mode 100644
index 0000000..bad5653
--- /dev/null
+++ b/docs/performance_test/README.md
@@ -0,0 +1,135 @@
+# Goals
+
+This originated from a demand to know how much RAM is consumed by Open Cloud when running a large number of workflows at the same time on the same node.
+
+We differentiate between two components:
+
+- The "oc-stack", which is the minimum set of services needed to create and schedule a workflow execution: oc-auth, oc-datacenter, oc-scheduler, oc-front, oc-schedulerd, oc-workflow, oc-catalog, oc-peer, oc-workspace, loki, mongo, traefik and nats
+
+- oc-monitord, which is the daemon instantiated by the scheduling daemon (oc-schedulerd) that creates the YAML for Argo and the necessary Kubernetes resources.
+
+We monitor both parts to see how much RAM the oc-stack uses before / during / after the execution, the RAM consumed by the monitord containers, and the total of the stack and the monitord containers.
+
+# Setup
+
+In order to have optimal performance we used a Proxmox server with ample resources (>370 GiB RAM and 128 cores) to host the two VMs composing our Kubernetes cluster, with one control plane node where the oc stack is running and a worker node with only k3s running.
+
+## VMs
+
+We instantiated a 2-node Kubernetes cluster (with k3s) on the superg PVE (https://superg-pve.irtse-pf.ext:8006/)
+
+### VM Control
+
+This VM is running the oc stack and the monitord containers; it carries the biggest part of the load. It must have k3s and Argo installed. We allocated **62 GiB of RAM** and **31 cores**.
+
+### VM Worker
+
+This VM is holding the workload for all the pods created, acting as a worker node for the k3s cluster. We deploy k3s as a node as explained in the K3s quick-start guide:
+
+`curl -sfL https://get.k3s.io | K3S_URL=https://myserver:6443 K3S_TOKEN=mynodetoken sh -`
+
+The value to use for K3S_TOKEN is stored at `/var/lib/rancher/k3s/server/node-token` on the server node.
+
+Verify on the control plane that the worker VM has been added as a node to the cluster: run `kubectl get nodes` and look for the hostname of the worker VM in the list of nodes.
+
+### Delegate pods to the worker node
+
+In order for the pods to be executed on another node we need to modify how we construct the Argo YAML, to add a node selector to the workflow spec. We have added the needed attributes to the `Spec` struct in `oc-monitord` on the `test-ram` branch.
+
+```go
+type Spec struct {
+    ServiceAccountName string                `yaml:"serviceAccountName"`
+    Entrypoint         string                `yaml:"entrypoint"`
+    Arguments          []Parameter           `yaml:"arguments,omitempty"`
+    Volumes            []VolumeClaimTemplate `yaml:"volumeClaimTemplates,omitempty"`
+    Templates          []Template            `yaml:"templates"`
+    Timeout            int                   `yaml:"activeDeadlineSeconds,omitempty"`
+    NodeSelector       struct{
+        NodeRole string `yaml:"node-role"`
+    } `yaml:"nodeSelector"`
+}
+```
+
+and set the selector value in the `CreateDAG()` method:
+
+```go
+b.Workflow.Spec.NodeSelector.NodeRole = "worker"
+```
+
+## Container monitoring
+
+Docker compose file to instantiate the monitoring stack:
+- Prometheus: storing data
+- cAdvisor: monitoring of the containers
+
+```yml
+version: '3.2'
+services:
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: prometheus
+    ports:
+    - 9090:9090
+    command:
+    - --config.file=/etc/prometheus/prometheus.yml
+    volumes:
+    - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+    depends_on:
+    - cadvisor
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    container_name: cadvisor
+    ports:
+    - 9999:8080
+    volumes:
+    - /:/rootfs:ro
+    - /var/run:/var/run:rw
+    - /sys:/sys:ro
+    - /var/lib/docker/:/var/lib/docker:ro
+
+```
+
+Prometheus scraping configuration:
+
+```yml
+scrape_configs:
+- job_name: cadvisor
+  scrape_interval: 5s
+  static_configs:
+  - targets:
+    - cadvisor:8080
+```
+
+## Dashboards
+
+In order to monitor the resource consumption during our tests we need to create a dashboard in Grafana.
+
+We create 4 different queries using Prometheus as the data source. For each query we can use the `code` mode to create it from a PromQL query.
+
+## OC stack consumption
+
+```
+sum(container_memory_usage_bytes{name=~"oc-auth|oc-datacenter|oc-scheduler|oc-front|oc-schedulerd|oc-workflow|oc-catalog|oc-peer|oc-workspace|loki|mongo|traefik|nats"})
+```
+
+## Monitord consumption
+
+```
+sum(container_memory_usage_bytes{image="oc-monitord"})
+```
+
+## Total RAM consumption
+
+```
+sum(
+    container_memory_usage_bytes{name=~"oc-auth|oc-datacenter|oc-scheduler|oc-front|oc-schedulerd|oc-workflow|oc-catalog|oc-peer|oc-workspace|loki|mongo|traefik|nats"}
+    or
+    container_memory_usage_bytes{image="oc-monitord"}
+)
+```
+
+## Number of monitord containers
+
+```
+count(container_memory_usage_bytes{image="oc-monitord"} > 0)
+```
\ No newline at end of file
diff --git a/docs/performance_test/insert_exec.sh b/docs/performance_test/insert_exec.sh
new file mode 100755
index 0000000..2a07c14
--- /dev/null
+++ b/docs/performance_test/insert_exec.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+#
+# Load-test helper: sends scheduling requests in parallel to the scheduler
+# API so that a batch of workflow executions starts within the same minute.
+#
+# Usage: TOKEN=<bearer token> ./insert_exec.sh [NB_EXEC] [TIME]
+#   NB_EXEC  number of executions wanted (default: 1); one request is sent
+#            per started group of 10
+#   TIME     delay, in minutes from now, before the executions start
+#            (default: 1)
+#
+# Environment:
+#   TOKEN          bearer token for the Authorization header (required);
+#                  never commit a real token
+#   SCHEDULER_URL  endpoint to POST to (default: the localhost URL below)
+#
+# Requires GNU date (-d option).
+
+set -euo pipefail
+
+# The token is a credential: read it from the environment, not from this file.
+: "${TOKEN:?export TOKEN with a valid bearer token before running}"
+
+SCHEDULER_URL="${SCHEDULER_URL:-http://localhost:8000/scheduler/62c55a70-e295-45e6-9925-a98137d59abc}"
+
+NB_EXEC="${1:-1}"
+TIME="${2:-1}"
+
+for arg in "$NB_EXEC" "$TIME"; do
+  if ! [[ "$arg" =~ ^[0-9]+$ ]]; then
+    echo "Usage: TOKEN=<bearer token> $0 [NB_EXEC] [TIME]" >&2
+    exit 2
+  fi
+done
+
+# One request per started group of 10 executions. 10# forces base 10 so a
+# zero-padded argument such as "08" is not rejected as invalid octal.
+EXECS=$(( (10#$NB_EXEC + 9) / 10 ))
+echo "EXECS=$EXECS"
+
+# Derive every date field from a single instant so that minute, hour, day,
+# month and year roll over together (e.g. 11:59 + 1 min gives 12:00).
+start_epoch=$(date -u -d "$TIME min" +%s)
+
+# NOTE(review): "0-10" covers 11 seconds while EXECS counts groups of 10;
+# confirm which one the scheduler is expected to honour.
+CRON=$(date -u -d "@$start_epoch" '+0-10 %M %H %d %m *')
+START=$(date -u -d "@$start_epoch" '+%Y-%m-%dT%H:%M:00.012Z')
+# The window closes 30 days later; computed from the epoch so the result is
+# always a valid date, including across December and on the 29th-31st.
+END=$(date -u -d "@$((start_epoch + 30 * 86400))" '+%Y-%m-%dT%H:%M:00.012Z')
+
+PAYLOAD=$(printf '{"id":null,"name":null,"cron":"%s","mode":1,"start":"%s","end":"%s"}' "$CRON" "$START" "$END")
+
+start_loop=$(date +%s)
+
+for ((i = 1; i <= EXECS; i++)); do
+  (
+    start_req=$(date +%s)
+
+    echo "Exec $i"
+    echo "$CRON"
+
+    # Keep going on failure so the timings below are still reported.
+    curl -X 'POST' "$SCHEDULER_URL" \
+      -H 'accept: application/json' \
+      -H 'Content-Type: application/json' \
+      -H "Authorization: Bearer $TOKEN" \
+      -d "$PAYLOAD" \
+      -w '\n' || echo "Exec $i: request failed" >&2
+
+    end=$(date +%s)
+    duration=$((end - start_req))
+
+    echo "Début $start_req"
+    echo "Fin $end"
+    echo "Durée d'exécution $i : $duration secondes"
+  ) &
+done
+
+wait
+
+# Wall-clock time for the whole batch, all requests included.
+end_loop=$(date +%s)
+total_time=$((end_loop - start_loop))
+echo "Durée d'exécution total : $total_time secondes"
diff --git a/docs/performance_test/performance_report.md b/docs/performance_test/performance_report.md
new file mode 100644
index 0000000..e69de29