diff --git a/app/api/routes.py b/app/api/routes.py
index f2ae479..e5e5cd0 100644
--- a/app/api/routes.py
+++ b/app/api/routes.py
@@ -519,16 +519,16 @@ async def get_workload_historical_metrics(
         for result in cluster_memory_data["data"]["result"]:
             cluster_memory_total += float(result["value"][1])
 
-        # Get workload-specific metrics using simpler queries
-        # CPU usage for specific pod
-        cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~".*{workload}.*"}}[5m])'
-        memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~".*{workload}.*", container!="", image!=""}}'
+        # Get workload-specific metrics using more precise queries
+        # CPU usage for specific pod (using exact pod name match)
+        cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{workload}"}}[5m])'
+        memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod="{workload}", container!="", image!=""}}'
 
         # Resource requests and limits for specific pod
-        cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
-        memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
-        cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
-        memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
+        cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
+        memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
+        cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
+        memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
 
         # Execute queries
         cpu_usage_data = await prometheus_client.query(cpu_usage_query)
@@ -600,7 +600,8 @@ async def get_workload_historical_metrics(
                     "requests_cores": 0,
                     "requests_percent": 0,
                     "limits_cores": 0,
-                    "limits_percent": 0
+                    "limits_percent": 0,
+                    "efficiency_percent": 0
                 },
                 "memory": {
                     "usage_bytes": 0,
@@ -611,7 +612,8 @@ async def get_workload_historical_metrics(
                     "requests_percent": 0,
                     "limits_bytes": 0,
                     "limits_mb": 0,
-                    "limits_percent": 0
+                    "limits_percent": 0,
+                    "efficiency_percent": 0
                 }
             }
         }
@@ -624,12 +626,17 @@ async def get_workload_historical_metrics(
         cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
         memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
 
+        # Calculate efficiency (usage vs requests)
+        cpu_efficiency = (cpu_usage / cpu_requests * 100) if cpu_requests > 0 else 0
+        memory_efficiency = (memory_usage / memory_requests * 100) if memory_requests > 0 else 0
+
         return {
             "workload": workload,
             "namespace": namespace,
             "time_range": time_range,
             "prometheus_available": True,
             "data_source": "prometheus",
+            "timestamp": datetime.now().isoformat(),
             "cluster_total": {
                 "cpu_cores": cluster_cpu_total,
                 "memory_bytes": cluster_memory_total,
@@ -642,7 +649,8 @@ async def get_workload_historical_metrics(
                 "requests_cores": cpu_requests,
                 "requests_percent": round(cpu_requests_percent, 2),
                 "limits_cores": cpu_limits,
-                "limits_percent": round(cpu_limits_percent, 2)
+                "limits_percent": round(cpu_limits_percent, 2),
+                "efficiency_percent": round(cpu_efficiency, 1)
             },
             "memory": {
                 "usage_bytes": memory_usage,
@@ -653,7 +661,8 @@ async def get_workload_historical_metrics(
                 "requests_percent": round(memory_requests_percent, 2),
                 "limits_bytes": memory_limits,
                 "limits_mb": round(memory_limits / (1024**2), 2),
-                "limits_percent": round(memory_limits_percent, 2)
+                "limits_percent": round(memory_limits_percent, 2),
+                "efficiency_percent": round(memory_efficiency, 1)
             }
         }
     }
diff --git a/app/services/validation_service.py b/app/services/validation_service.py
index 7379f5c..9bd702f 100644
--- a/app/services/validation_service.py
+++ b/app/services/validation_service.py
@@ -104,12 +104,7 @@ class ValidationService:
                     recommendation="Define limits to avoid excessive resource consumption"
                 ))
 
-            # 3. QoS Class validation based on Red Hat recommendations
-            qos_validation = self._validate_qos_class(pod_name, namespace, container["name"], qos_class, requests, limits)
-            if qos_validation:
-                validations.append(qos_validation)
-
-            # 3. Validate limit:request ratio
+            # 3. Validate limit:request ratio (only if both requests and limits exist)
             if requests and limits:
                 cpu_validation = self._validate_cpu_ratio(
                     pod_name, namespace, container["name"], requests, limits
@@ -123,7 +118,7 @@ class ValidationService:
                 if memory_validation:
                     validations.append(memory_validation)
 
-            # 4. Add container resource metrics validation
+            # 4. Add container resource metrics validation (only if resources exist)
             if requests or limits:
                 metrics_validation = self._validate_container_metrics(
                     pod_name, namespace, container["name"], requests, limits
@@ -131,7 +126,7 @@ class ValidationService:
                 if metrics_validation:
                     validations.append(metrics_validation)
 
-            # 5. Validate minimum values
+            # 5. Validate minimum values (only if requests exist)
             if requests:
                 min_validation = self._validate_minimum_values(
                     pod_name, namespace, container["name"], requests
@@ -387,32 +382,6 @@ class ValidationService:
         else:
             return "BestEffort"
 
-    def _validate_qos_class(self, pod_name: str, namespace: str, container_name: str, qos_class: str, requests: Dict[str, str], limits: Dict[str, str]) -> Optional[ResourceValidation]:
-        """Validate QoS class and provide recommendations"""
-        cpu_requests = self._parse_cpu_value(requests.get("cpu", "0"))
-        memory_requests = self._parse_memory_value(requests.get("memory", "0")) / (1024 * 1024 * 1024)  # Convert to GB
-        cpu_limits = self._parse_cpu_value(limits.get("cpu", "0"))
-        memory_limits = self._parse_memory_value(limits.get("memory", "0")) / (1024 * 1024 * 1024)  # Convert to GB
-
-        # Check for missing requests (BestEffort pods) - removed duplicate validation
-        # This is already handled at container level in _validate_container_resources
-
-        # Check for missing limits (Burstable pods)
-        if qos_class == "Burstable" and (cpu_limits == 0 or memory_limits == 0):
-            return ResourceValidation(
-                pod_name=pod_name,
-                namespace=namespace,
-                container_name=container_name,
-                validation_type="missing_limits",
-                severity="warning",
-                message="Pod has requests but no limits defined",
-                recommendation="Define resource limits to prevent resource starvation",
-                priority_score=5,
-                workload_category="established",
-                estimated_impact="low"
-            )
-
-        return None
 
     def validate_namespace_overcommit(
         self,
diff --git a/app/static/index.html b/app/static/index.html
index f26429f..645eaf9 100644
--- a/app/static/index.html
+++ b/app/static/index.html
@@ -1609,7 +1609,7 @@
                                 Efficiency:
-                                ${cpu.usage_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}
+                                ${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}
                                 (usage vs requests)
@@ -1633,7 +1633,7 @@
                                 Efficiency:
-                                ${memory.usage_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}
+                                ${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}
                                 (usage vs requests)
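Besides the divide-by-zero guards, the routes.py change also tightens pod selection from a fuzzy regex selector (`pod=~".*{workload}.*"`, which can also match sibling pods with similar names) to an exact match (`pod="{workload}"`). A minimal sketch of the difference, using a hypothetical `build_cpu_usage_query` helper (not part of the patch):

```python
def build_cpu_usage_query(namespace: str, pod: str, exact: bool = True) -> str:
    """Build the per-pod CPU usage PromQL query with an exact or fuzzy pod selector."""
    selector = f'pod="{pod}"' if exact else f'pod=~".*{pod}.*"'
    return (
        f'rate(container_cpu_usage_seconds_total{{'
        f'namespace="{namespace}", {selector}}}[5m])'
    )

print(build_cpu_usage_query("shop", "checkout-7d9f8c"))
# rate(container_cpu_usage_seconds_total{namespace="shop", pod="checkout-7d9f8c"}[5m])
```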