Fix data unification and efficiency calculations
- Unify Prometheus queries between namespace analysis and historical analysis
- Fix efficiency calculations to prevent division by zero
- Remove duplicate validations in validation service
- Improve frontend data display with clear numerical values
- Add proper error handling for missing data
This commit is contained in:
@@ -519,16 +519,16 @@ async def get_workload_historical_metrics(
|
|||||||
for result in cluster_memory_data["data"]["result"]:
|
for result in cluster_memory_data["data"]["result"]:
|
||||||
cluster_memory_total += float(result["value"][1])
|
cluster_memory_total += float(result["value"][1])
|
||||||
|
|
||||||
# Get workload-specific metrics using simpler queries
|
# Get workload-specific metrics using more precise queries
|
||||||
# CPU usage for specific pod
|
# CPU usage for specific pod (using exact pod name match)
|
||||||
cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~".*{workload}.*"}}[5m])'
|
cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{workload}"}}[5m])'
|
||||||
memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~".*{workload}.*", container!="", image!=""}}'
|
memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod="{workload}", container!="", image!=""}}'
|
||||||
|
|
||||||
# Resource requests and limits for specific pod
|
# Resource requests and limits for specific pod
|
||||||
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
|
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
|
||||||
memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
|
memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
|
||||||
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
|
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="cpu"}})'
|
||||||
memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
|
memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod="{workload}", resource="memory"}})'
|
||||||
|
|
||||||
# Execute queries
|
# Execute queries
|
||||||
cpu_usage_data = await prometheus_client.query(cpu_usage_query)
|
cpu_usage_data = await prometheus_client.query(cpu_usage_query)
|
||||||
@@ -600,7 +600,8 @@ async def get_workload_historical_metrics(
|
|||||||
"requests_cores": 0,
|
"requests_cores": 0,
|
||||||
"requests_percent": 0,
|
"requests_percent": 0,
|
||||||
"limits_cores": 0,
|
"limits_cores": 0,
|
||||||
"limits_percent": 0
|
"limits_percent": 0,
|
||||||
|
"efficiency_percent": 0
|
||||||
},
|
},
|
||||||
"memory": {
|
"memory": {
|
||||||
"usage_bytes": 0,
|
"usage_bytes": 0,
|
||||||
@@ -611,7 +612,8 @@ async def get_workload_historical_metrics(
|
|||||||
"requests_percent": 0,
|
"requests_percent": 0,
|
||||||
"limits_bytes": 0,
|
"limits_bytes": 0,
|
||||||
"limits_mb": 0,
|
"limits_mb": 0,
|
||||||
"limits_percent": 0
|
"limits_percent": 0,
|
||||||
|
"efficiency_percent": 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -624,12 +626,17 @@ async def get_workload_historical_metrics(
|
|||||||
cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
||||||
memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
|
memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
|
||||||
|
|
||||||
|
# Calculate efficiency (usage vs requests)
|
||||||
|
cpu_efficiency = (cpu_usage / cpu_requests * 100) if cpu_requests > 0 else 0
|
||||||
|
memory_efficiency = (memory_usage / memory_requests * 100) if memory_requests > 0 else 0
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"workload": workload,
|
"workload": workload,
|
||||||
"namespace": namespace,
|
"namespace": namespace,
|
||||||
"time_range": time_range,
|
"time_range": time_range,
|
||||||
"prometheus_available": True,
|
"prometheus_available": True,
|
||||||
"data_source": "prometheus",
|
"data_source": "prometheus",
|
||||||
|
"timestamp": datetime.now().isoformat(),
|
||||||
"cluster_total": {
|
"cluster_total": {
|
||||||
"cpu_cores": cluster_cpu_total,
|
"cpu_cores": cluster_cpu_total,
|
||||||
"memory_bytes": cluster_memory_total,
|
"memory_bytes": cluster_memory_total,
|
||||||
@@ -642,7 +649,8 @@ async def get_workload_historical_metrics(
|
|||||||
"requests_cores": cpu_requests,
|
"requests_cores": cpu_requests,
|
||||||
"requests_percent": round(cpu_requests_percent, 2),
|
"requests_percent": round(cpu_requests_percent, 2),
|
||||||
"limits_cores": cpu_limits,
|
"limits_cores": cpu_limits,
|
||||||
"limits_percent": round(cpu_limits_percent, 2)
|
"limits_percent": round(cpu_limits_percent, 2),
|
||||||
|
"efficiency_percent": round(cpu_efficiency, 1)
|
||||||
},
|
},
|
||||||
"memory": {
|
"memory": {
|
||||||
"usage_bytes": memory_usage,
|
"usage_bytes": memory_usage,
|
||||||
@@ -653,7 +661,8 @@ async def get_workload_historical_metrics(
|
|||||||
"requests_percent": round(memory_requests_percent, 2),
|
"requests_percent": round(memory_requests_percent, 2),
|
||||||
"limits_bytes": memory_limits,
|
"limits_bytes": memory_limits,
|
||||||
"limits_mb": round(memory_limits / (1024**2), 2),
|
"limits_mb": round(memory_limits / (1024**2), 2),
|
||||||
"limits_percent": round(memory_limits_percent, 2)
|
"limits_percent": round(memory_limits_percent, 2),
|
||||||
|
"efficiency_percent": round(memory_efficiency, 1)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -104,12 +104,7 @@ class ValidationService:
|
|||||||
recommendation="Define limits to avoid excessive resource consumption"
|
recommendation="Define limits to avoid excessive resource consumption"
|
||||||
))
|
))
|
||||||
|
|
||||||
# 3. QoS Class validation based on Red Hat recommendations
|
# 3. Validate limit:request ratio (only if both requests and limits exist)
|
||||||
qos_validation = self._validate_qos_class(pod_name, namespace, container["name"], qos_class, requests, limits)
|
|
||||||
if qos_validation:
|
|
||||||
validations.append(qos_validation)
|
|
||||||
|
|
||||||
# 3. Validate limit:request ratio
|
|
||||||
if requests and limits:
|
if requests and limits:
|
||||||
cpu_validation = self._validate_cpu_ratio(
|
cpu_validation = self._validate_cpu_ratio(
|
||||||
pod_name, namespace, container["name"], requests, limits
|
pod_name, namespace, container["name"], requests, limits
|
||||||
@@ -123,7 +118,7 @@ class ValidationService:
|
|||||||
if memory_validation:
|
if memory_validation:
|
||||||
validations.append(memory_validation)
|
validations.append(memory_validation)
|
||||||
|
|
||||||
# 4. Add container resource metrics validation
|
# 4. Add container resource metrics validation (only if resources exist)
|
||||||
if requests or limits:
|
if requests or limits:
|
||||||
metrics_validation = self._validate_container_metrics(
|
metrics_validation = self._validate_container_metrics(
|
||||||
pod_name, namespace, container["name"], requests, limits
|
pod_name, namespace, container["name"], requests, limits
|
||||||
@@ -131,7 +126,7 @@ class ValidationService:
|
|||||||
if metrics_validation:
|
if metrics_validation:
|
||||||
validations.append(metrics_validation)
|
validations.append(metrics_validation)
|
||||||
|
|
||||||
# 5. Validate minimum values
|
# 5. Validate minimum values (only if requests exist)
|
||||||
if requests:
|
if requests:
|
||||||
min_validation = self._validate_minimum_values(
|
min_validation = self._validate_minimum_values(
|
||||||
pod_name, namespace, container["name"], requests
|
pod_name, namespace, container["name"], requests
|
||||||
@@ -387,32 +382,6 @@ class ValidationService:
|
|||||||
else:
|
else:
|
||||||
return "BestEffort"
|
return "BestEffort"
|
||||||
|
|
||||||
def _validate_qos_class(self, pod_name: str, namespace: str, container_name: str, qos_class: str, requests: Dict[str, str], limits: Dict[str, str]) -> Optional[ResourceValidation]:
|
|
||||||
"""Validate QoS class and provide recommendations"""
|
|
||||||
cpu_requests = self._parse_cpu_value(requests.get("cpu", "0"))
|
|
||||||
memory_requests = self._parse_memory_value(requests.get("memory", "0")) / (1024 * 1024 * 1024) # Convert to GB
|
|
||||||
cpu_limits = self._parse_cpu_value(limits.get("cpu", "0"))
|
|
||||||
memory_limits = self._parse_memory_value(limits.get("memory", "0")) / (1024 * 1024 * 1024) # Convert to GB
|
|
||||||
|
|
||||||
# Check for missing requests (BestEffort pods) - removed duplicate validation
|
|
||||||
# This is already handled at container level in _validate_container_resources
|
|
||||||
|
|
||||||
# Check for missing limits (Burstable pods)
|
|
||||||
if qos_class == "Burstable" and (cpu_limits == 0 or memory_limits == 0):
|
|
||||||
return ResourceValidation(
|
|
||||||
pod_name=pod_name,
|
|
||||||
namespace=namespace,
|
|
||||||
container_name=container_name,
|
|
||||||
validation_type="missing_limits",
|
|
||||||
severity="warning",
|
|
||||||
message="Pod has requests but no limits defined",
|
|
||||||
recommendation="Define resource limits to prevent resource starvation",
|
|
||||||
priority_score=5,
|
|
||||||
workload_category="established",
|
|
||||||
estimated_impact="low"
|
|
||||||
)
|
|
||||||
|
|
||||||
return None
|
|
||||||
|
|
||||||
def validate_namespace_overcommit(
|
def validate_namespace_overcommit(
|
||||||
self,
|
self,
|
||||||
|
|||||||
@@ -1609,7 +1609,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="metric-row">
|
<div class="metric-row">
|
||||||
<span class="metric-label">Efficiency:</span>
|
<span class="metric-label">Efficiency:</span>
|
||||||
<span class="metric-value ${cpu.usage_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}">${cpu.usage_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}</span>
|
<span class="metric-value ${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}">${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}</span>
|
||||||
<span class="metric-percent">(usage vs requests)</span>
|
<span class="metric-percent">(usage vs requests)</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -1633,7 +1633,7 @@
|
|||||||
</div>
|
</div>
|
||||||
<div class="metric-row">
|
<div class="metric-row">
|
||||||
<span class="metric-label">Efficiency:</span>
|
<span class="metric-label">Efficiency:</span>
|
||||||
<span class="metric-value ${memory.usage_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}">${memory.usage_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}</span>
|
<span class="metric-value ${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}">${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}</span>
|
||||||
<span class="metric-percent">(usage vs requests)</span>
|
<span class="metric-percent">(usage vs requests)</span>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
Reference in New Issue
Block a user