From f3b802222438ac69b524513107941999d8fb4eb1 Mon Sep 17 00:00:00 2001
From: andersonid
Date: Tue, 30 Sep 2025 16:48:31 -0300
Subject: [PATCH] Phase 1.2: Complete Historical Analysis Integration - Add
 insufficient data detection, seasonal patterns, and integrate in main
 dashboard

---
 app/api/routes.py                   |  12 ++-
 app/services/historical_analysis.py | 148 +++++++++++++++++++++++++++-
 2 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/app/api/routes.py b/app/api/routes.py
index 350aec9..5a825b1 100644
--- a/app/api/routes.py
+++ b/app/api/routes.py
@@ -44,11 +44,21 @@ async def get_cluster_status(
     pods = await k8s_client.get_all_pods()
     nodes_info = await k8s_client.get_nodes_info()
 
-    # Validate resources
+    # Validate resources with historical analysis
     all_validations = []
+    historical_service = HistoricalAnalysisService()
+
     for pod in pods:
+        # Static validations
         pod_validations = validation_service.validate_pod_resources(pod)
         all_validations.extend(pod_validations)
+
+        # Historical analysis (async)
+        try:
+            historical_validations = await validation_service.validate_pod_resources_with_historical_analysis(pod, "24h")
+            all_validations.extend(historical_validations)
+        except Exception as e:
+            logger.warning(f"Error in historical analysis for pod {pod.name}: {e}")
 
     # Get overcommit information
     overcommit_info = await prometheus_client.get_cluster_overcommit()
diff --git a/app/services/historical_analysis.py b/app/services/historical_analysis.py
index a4b703a..4f1acc7 100644
--- a/app/services/historical_analysis.py
+++ b/app/services/historical_analysis.py
@@ -197,6 +197,76 @@ class HistoricalAnalysisService:
 
         return validations
 
+    def _detect_seasonal_patterns(
+        self,
+        pod_name: str,
+        namespace: str,
+        container_name: str,
+        usage_values: List[float],
+        time_range: str
+    ) -> List[ResourceValidation]:
+        """Detect seasonal patterns and trends in resource usage"""
+        validations = []
+
+        if len(usage_values) < 20:  # Need at least 20 data points for pattern detection
+            return validations
+
+        # Calculate trend (simple linear regression)
+        n = len(usage_values)
+        x = list(range(n))
+        y = usage_values
+
+        # Calculate slope
+        x_mean = sum(x) / n
+        y_mean = sum(y) / n
+
+        numerator = sum((x[i] - x_mean) * (y[i] - y_mean) for i in range(n))
+        denominator = sum((x[i] - x_mean) ** 2 for i in range(n))
+
+        if denominator != 0:
+            slope = numerator / denominator
+
+            # Detect significant trends
+            if slope > 0.1:  # Increasing trend
+                validations.append(ResourceValidation(
+                    pod_name=pod_name,
+                    namespace=namespace,
+                    container_name=container_name,
+                    validation_type="seasonal_pattern",
+                    severity="info",
+                    message=f"Detected increasing resource usage trend over {time_range}",
+                    recommendation="Monitor for continued growth and consider proactive scaling"
+                ))
+            elif slope < -0.1:  # Decreasing trend
+                validations.append(ResourceValidation(
+                    pod_name=pod_name,
+                    namespace=namespace,
+                    container_name=container_name,
+                    validation_type="seasonal_pattern",
+                    severity="info",
+                    message=f"Detected decreasing resource usage trend over {time_range}",
+                    recommendation="Consider reducing resource requests/limits if trend continues"
+                ))
+
+        # Detect high variability (coefficient of variation > 50%)
+        if y_mean > 0:
+            variance = sum((y[i] - y_mean) ** 2 for i in range(n)) / n
+            std_dev = variance ** 0.5
+            cv = std_dev / y_mean
+
+            if cv > 0.5:  # High variability
+                validations.append(ResourceValidation(
+                    pod_name=pod_name,
+                    namespace=namespace,
+                    container_name=container_name,
+                    validation_type="seasonal_pattern",
+                    severity="warning",
+                    message=f"High resource usage variability detected (CV: {cv:.2f})",
+                    recommendation="Consider higher safety margins for requests/limits due to unpredictable usage"
+                ))
+
+        return validations
+
     def _analyze_cpu_metrics(
         self,
         pod_name: str,
@@ -210,14 +280,45 @@ class HistoricalAnalysisService:
         """Analyze CPU metrics"""
         validations = []
 
-        if not usage_data or not requests_data:
+        # Check for insufficient historical data
+        if not usage_data:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="info",
+                message=f"No CPU usage data available for {time_range}",
+                recommendation="Monitor workload for at least 24h to get reliable resource recommendations"
+            ))
             return validations
 
         # Calculate usage statistics
         usage_values = [float(point[1]) for point in usage_data if point[1] != 'NaN']
         if not usage_values:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="info",
+                message=f"No valid CPU usage data points for {time_range}",
+                recommendation="Check if pod is running and generating metrics"
+            ))
             return validations
 
+        # Check for minimal data points (less than 10 data points)
+        if len(usage_values) < 10:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="warning",
+                message=f"Limited CPU usage data ({len(usage_values)} points) for {time_range}",
+                recommendation="Wait for more data points or extend time range for reliable analysis"
+            ))
+
         # Current values of requests/limits
         current_requests = self._safe_float(requests_data[0][1]) if requests_data else 0
         current_limits = self._safe_float(limits_data[0][1]) if limits_data else 0
@@ -228,6 +329,12 @@ class HistoricalAnalysisService:
         p95_usage = sorted(usage_values)[int(len(usage_values) * 0.95)]
         p99_usage = sorted(usage_values)[int(len(usage_values) * 0.99)]
 
+        # Detect seasonal patterns
+        seasonal_validations = self._detect_seasonal_patterns(
+            pod_name, namespace, container_name, usage_values, time_range
+        )
+        validations.extend(seasonal_validations)
+
         # Request adequacy analysis
         if current_requests > 0:
             # Request too high (average usage < 50% of request)
@@ -295,14 +402,45 @@ class HistoricalAnalysisService:
         """Analyze memory metrics"""
         validations = []
 
-        if not usage_data or not requests_data:
+        # Check for insufficient historical data
+        if not usage_data:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="info",
+                message=f"No memory usage data available for {time_range}",
+                recommendation="Monitor workload for at least 24h to get reliable resource recommendations"
+            ))
             return validations
 
         # Calculate usage statistics
         usage_values = [float(point[1]) for point in usage_data if point[1] != 'NaN']
         if not usage_values:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="info",
+                message=f"No valid memory usage data points for {time_range}",
+                recommendation="Check if pod is running and generating metrics"
+            ))
             return validations
 
+        # Check for minimal data points (less than 10 data points)
+        if len(usage_values) < 10:
+            validations.append(ResourceValidation(
+                pod_name=pod_name,
+                namespace=namespace,
+                container_name=container_name,
+                validation_type="insufficient_historical_data",
+                severity="warning",
+                message=f"Limited memory usage data ({len(usage_values)} points) for {time_range}",
+                recommendation="Wait for more data points or extend time range for reliable analysis"
+            ))
+
         # Current values of requests/limits (in bytes)
         current_requests = self._safe_float(requests_data[0][1]) if requests_data else 0
         current_limits = self._safe_float(limits_data[0][1]) if limits_data else 0
@@ -313,6 +451,12 @@ class HistoricalAnalysisService:
         p95_usage = sorted(usage_values)[int(len(usage_values) * 0.95)]
         p99_usage = sorted(usage_values)[int(len(usage_values) * 0.99)]
 
+        # Detect seasonal patterns
+        seasonal_validations = self._detect_seasonal_patterns(
+            pod_name, namespace, container_name, usage_values, time_range
+        )
+        validations.extend(seasonal_validations)
+
         # Convert to MiB for better readability
         def bytes_to_mib(bytes_value):
             return bytes_value / (1024 * 1024)