Fix data unification and efficiency calculations

- Unify Prometheus queries between namespace analysis and historical analysis
- Fix efficiency calculations to prevent division by zero
- Remove duplicate validations in validation service
- Improve frontend data display with clear numerical values
- Add proper error handling for missing data
This commit is contained in:
2025-10-01 14:43:43 -03:00
parent 6ad1997afd
commit ee20a09147
3 changed files with 26 additions and 48 deletions

View File

@@ -519,16 +519,16 @@ async def get_workload_historical_metrics(
for result in cluster_memory_data["data"]["result"]: for result in cluster_memory_data["data"]["result"]:
cluster_memory_total += float(result["value"][1]) cluster_memory_total += float(result["value"][1])
# Get workload-specific metrics using simpler queries # Get workload-specific metrics using more precise queries
# CPU usage for specific pod # CPU usage for specific pod (using exact pod name match)
cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~".*{workload}.*"}}[5m])' cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{workload}"}}[5m])'
memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~".*{workload}.*", container!="", image!=""}}' memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod="{workload}", container!="", image!=""}}'
# Resource requests and limits for specific pod # Resource requests and limits for specific pod
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})' cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})' memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})' cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})' memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
# Execute queries # Execute queries
cpu_usage_data = await prometheus_client.query(cpu_usage_query) cpu_usage_data = await prometheus_client.query(cpu_usage_query)
@@ -600,7 +600,8 @@ async def get_workload_historical_metrics(
"requests_cores": 0, "requests_cores": 0,
"requests_percent": 0, "requests_percent": 0,
"limits_cores": 0, "limits_cores": 0,
"limits_percent": 0 "limits_percent": 0,
"efficiency_percent": 0
}, },
"memory": { "memory": {
"usage_bytes": 0, "usage_bytes": 0,
@@ -611,7 +612,8 @@ async def get_workload_historical_metrics(
"requests_percent": 0, "requests_percent": 0,
"limits_bytes": 0, "limits_bytes": 0,
"limits_mb": 0, "limits_mb": 0,
"limits_percent": 0 "limits_percent": 0,
"efficiency_percent": 0
} }
} }
} }
@@ -624,12 +626,17 @@ async def get_workload_historical_metrics(
cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0 cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0 memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
# Calculate efficiency (usage vs requests)
cpu_efficiency = (cpu_usage / cpu_requests * 100) if cpu_requests > 0 else 0
memory_efficiency = (memory_usage / memory_requests * 100) if memory_requests > 0 else 0
return { return {
"workload": workload, "workload": workload,
"namespace": namespace, "namespace": namespace,
"time_range": time_range, "time_range": time_range,
"prometheus_available": True, "prometheus_available": True,
"data_source": "prometheus", "data_source": "prometheus",
"timestamp": datetime.now().isoformat(),
"cluster_total": { "cluster_total": {
"cpu_cores": cluster_cpu_total, "cpu_cores": cluster_cpu_total,
"memory_bytes": cluster_memory_total, "memory_bytes": cluster_memory_total,
@@ -642,7 +649,8 @@ async def get_workload_historical_metrics(
"requests_cores": cpu_requests, "requests_cores": cpu_requests,
"requests_percent": round(cpu_requests_percent, 2), "requests_percent": round(cpu_requests_percent, 2),
"limits_cores": cpu_limits, "limits_cores": cpu_limits,
"limits_percent": round(cpu_limits_percent, 2) "limits_percent": round(cpu_limits_percent, 2),
"efficiency_percent": round(cpu_efficiency, 1)
}, },
"memory": { "memory": {
"usage_bytes": memory_usage, "usage_bytes": memory_usage,
@@ -653,7 +661,8 @@ async def get_workload_historical_metrics(
"requests_percent": round(memory_requests_percent, 2), "requests_percent": round(memory_requests_percent, 2),
"limits_bytes": memory_limits, "limits_bytes": memory_limits,
"limits_mb": round(memory_limits / (1024**2), 2), "limits_mb": round(memory_limits / (1024**2), 2),
"limits_percent": round(memory_limits_percent, 2) "limits_percent": round(memory_limits_percent, 2),
"efficiency_percent": round(memory_efficiency, 1)
} }
} }
} }

View File

@@ -104,12 +104,7 @@ class ValidationService:
recommendation="Define limits to avoid excessive resource consumption" recommendation="Define limits to avoid excessive resource consumption"
)) ))
# 3. QoS Class validation based on Red Hat recommendations # 3. Validate limit:request ratio (only if both requests and limits exist)
qos_validation = self._validate_qos_class(pod_name, namespace, container["name"], qos_class, requests, limits)
if qos_validation:
validations.append(qos_validation)
# 3. Validate limit:request ratio
if requests and limits: if requests and limits:
cpu_validation = self._validate_cpu_ratio( cpu_validation = self._validate_cpu_ratio(
pod_name, namespace, container["name"], requests, limits pod_name, namespace, container["name"], requests, limits
@@ -123,7 +118,7 @@ class ValidationService:
if memory_validation: if memory_validation:
validations.append(memory_validation) validations.append(memory_validation)
# 4. Add container resource metrics validation # 4. Add container resource metrics validation (only if resources exist)
if requests or limits: if requests or limits:
metrics_validation = self._validate_container_metrics( metrics_validation = self._validate_container_metrics(
pod_name, namespace, container["name"], requests, limits pod_name, namespace, container["name"], requests, limits
@@ -131,7 +126,7 @@ class ValidationService:
if metrics_validation: if metrics_validation:
validations.append(metrics_validation) validations.append(metrics_validation)
# 5. Validate minimum values # 5. Validate minimum values (only if requests exist)
if requests: if requests:
min_validation = self._validate_minimum_values( min_validation = self._validate_minimum_values(
pod_name, namespace, container["name"], requests pod_name, namespace, container["name"], requests
@@ -387,32 +382,6 @@ class ValidationService:
else: else:
return "BestEffort" return "BestEffort"
def _validate_qos_class(self, pod_name: str, namespace: str, container_name: str, qos_class: str, requests: Dict[str, str], limits: Dict[str, str]) -> Optional[ResourceValidation]:
    """Flag Burstable containers that are missing a CPU or memory limit.

    Args:
        pod_name: Name of the pod owning the container.
        namespace: Namespace the pod lives in.
        container_name: Name of the container being validated.
        qos_class: QoS class string computed for the pod; only
            ``"Burstable"`` can produce a finding here.
        requests: Container resource requests. Kept for interface
            compatibility; this check does not read it.
        limits: Container resource limits, read via the ``"cpu"`` and
            ``"memory"`` keys. A falsy value is treated as "no limits".

    Returns:
        A warning-level ``ResourceValidation`` of type ``missing_limits``
        when the pod is Burstable and the parsed CPU or memory limit is
        zero; otherwise ``None``.
    """
    # Missing requests (BestEffort pods) are deliberately not checked here:
    # that case is already handled at container level in
    # _validate_container_resources.
    if qos_class != "Burstable":
        return None

    # The caller invokes this before checking that limits exist, so guard
    # against a missing mapping instead of failing on `.get`.
    limits = limits or {}

    # Only zero / non-zero matters, so the parsed values are compared
    # directly; no unit conversion is needed.
    cpu_limits = self._parse_cpu_value(limits.get("cpu", "0"))
    memory_limits = self._parse_memory_value(limits.get("memory", "0"))

    if cpu_limits == 0 or memory_limits == 0:
        return ResourceValidation(
            pod_name=pod_name,
            namespace=namespace,
            container_name=container_name,
            validation_type="missing_limits",
            severity="warning",
            message="Pod has requests but no limits defined",
            recommendation="Define resource limits to prevent resource starvation",
            priority_score=5,
            workload_category="established",
            estimated_impact="low"
        )

    return None
def validate_namespace_overcommit( def validate_namespace_overcommit(
self, self,

View File

@@ -1609,7 +1609,7 @@
</div> </div>
<div class="metric-row"> <div class="metric-row">
<span class="metric-label">Efficiency:</span> <span class="metric-label">Efficiency:</span>
<span class="metric-value ${cpu.usage_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}">${cpu.usage_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}</span> <span class="metric-value ${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}">${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}</span>
<span class="metric-percent">(usage vs requests)</span> <span class="metric-percent">(usage vs requests)</span>
</div> </div>
</div> </div>
@@ -1633,7 +1633,7 @@
</div> </div>
<div class="metric-row"> <div class="metric-row">
<span class="metric-label">Efficiency:</span> <span class="metric-label">Efficiency:</span>
<span class="metric-value ${memory.usage_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}">${memory.usage_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}</span> <span class="metric-value ${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}">${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}</span>
<span class="metric-percent">(usage vs requests)</span> <span class="metric-percent">(usage vs requests)</span>
</div> </div>
</div> </div>