Fix historical analysis contradictions and implement workload-based analysis

- Fix the contradiction between insufficient_historical_data and historical_analysis
- Add a return statement when historical data is insufficient, to prevent the P99 calculation
- Implement workload-based historical analysis instead of pod-based
- Add _extract_workload_name() to identify workload from pod names
- Add analyze_workload_historical_usage() for workload-level analysis
- Add _analyze_workload_metrics() with Prometheus workload queries
- Add validate_workload_resources_with_historical_analysis() method
- Update /cluster/status endpoint to use workload analysis by namespace
- Improve reliability by analyzing workloads instead of individual pods
- Maintain fallback to pod-level analysis if workload analysis fails
This commit is contained in:
2025-10-01 16:32:12 -03:00
parent 6f5c8b0cac
commit 4721a1ef37
3 changed files with 280 additions and 11 deletions

View File

@@ -45,22 +45,39 @@ async def get_cluster_status(
pods = await k8s_client.get_all_pods()
nodes_info = await k8s_client.get_nodes_info()
# Validate resources with historical analysis (includes static validations)
# Validate resources with historical analysis by workload (more reliable)
all_validations = []
# Group pods by namespace for workload analysis
namespace_pods = {}
for pod in pods:
# Historical analysis includes static validations
if pod.namespace not in namespace_pods:
namespace_pods[pod.namespace] = []
namespace_pods[pod.namespace].append(pod)
# Analyze each namespace's workloads
for namespace, namespace_pod_list in namespace_pods.items():
try:
historical_validations = await validation_service.validate_pod_resources_with_historical_analysis(pod, "24h")
all_validations.extend(historical_validations)
# Use workload-based analysis (more reliable than individual pods)
workload_validations = await validation_service.validate_workload_resources_with_historical_analysis(
namespace_pod_list, "24h"
)
all_validations.extend(workload_validations)
except Exception as e:
logger.warning(f"Error in historical analysis for pod {pod.name}: {e}")
# Fallback to static validations only if historical analysis fails
try:
static_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(static_validations)
except Exception as static_e:
logger.error(f"Error in static validation for pod {pod.name}: {static_e}")
logger.warning(f"Error in workload analysis for namespace {namespace}: {e}")
# Fallback to individual pod analysis
for pod in namespace_pod_list:
try:
pod_validations = await validation_service.validate_pod_resources_with_historical_analysis(pod, "24h")
all_validations.extend(pod_validations)
except Exception as pod_e:
logger.warning(f"Error in historical analysis for pod {pod.name}: {pod_e}")
# Final fallback to static validations only
try:
static_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(static_validations)
except Exception as static_e:
logger.error(f"Error in static validation for pod {pod.name}: {static_e}")
# Get overcommit information
overcommit_info = await prometheus_client.get_cluster_overcommit()