diff --git a/app/api/routes.py b/app/api/routes.py index 60485b6..700e75d 100644 --- a/app/api/routes.py +++ b/app/api/routes.py @@ -55,16 +55,74 @@ async def get_cluster_status( # Get VPA recommendations vpa_recommendations = await k8s_client.get_vpa_recommendations() - # Generate report - report = report_service.generate_cluster_report( - pods=pods, - validations=all_validations, - vpa_recommendations=vpa_recommendations, - overcommit_info=overcommit_info, - nodes_info=nodes_info - ) + # Group pods by namespace for the frontend + namespaces_data = {} + for pod in pods: + namespace = pod.namespace + if namespace not in namespaces_data: + namespaces_data[namespace] = { + 'namespace': namespace, + 'pods': {}, + 'total_validations': 0, + 'severity_breakdown': {'error': 0, 'warning': 0, 'info': 0} + } + + # Add pod to namespace + pod_name = pod.name + pod_validations = validation_service.validate_pod_resources(pod) + + # Convert pod to the format expected by frontend + pod_data = { + 'pod_name': pod_name, + 'namespace': namespace, + 'phase': pod.phase, + 'node_name': pod.node_name, + 'containers': [], + 'validations': [] + } + + # Add containers + for container in pod.containers: + container_data = { + 'name': container['name'], + 'image': container['image'], + 'resources': container['resources'] + } + pod_data['containers'].append(container_data) + + # Add validations for this pod + for validation in pod_validations: + validation_data = { + 'rule_name': validation.validation_type, + 'namespace': namespace, + 'message': validation.message, + 'recommendation': validation.recommendation, + 'severity': validation.severity + } + pod_data['validations'].append(validation_data) + + # Update namespace severity breakdown + namespaces_data[namespace]['severity_breakdown'][validation.severity] += 1 + namespaces_data[namespace]['total_validations'] += 1 + + namespaces_data[namespace]['pods'][pod_name] = pod_data - return report + # Convert to list format expected by frontend + namespaces_list = list(namespaces_data.values()) + + # Count total errors and warnings + total_errors = sum(ns['severity_breakdown']['error'] for ns in namespaces_list) + total_warnings = sum(ns['severity_breakdown']['warning'] for ns in namespaces_list) + + return { + "timestamp": datetime.now().isoformat(), + "total_pods": len(pods), + "total_namespaces": len(namespaces_list), + "total_nodes": len(nodes_info) if nodes_info else 0, + "total_errors": total_errors, + "total_warnings": total_warnings, + "namespaces": namespaces_list + } except Exception as e: logger.error(f"Error getting cluster status: {e}") @@ -449,6 +507,34 @@ async def get_namespace_historical_analysis( logger.error(f"Error getting historical analysis for namespace {namespace}: {e}") raise HTTPException(status_code=500, detail=str(e)) +@api_router.get("/namespace/{namespace}/workload/{workload}/historical-analysis") +async def get_workload_historical_analysis( + namespace: str, + workload: str, + time_range: str = "24h", + prometheus_client=Depends(get_prometheus_client) +): + """Get historical analysis for a specific workload/deployment""" + try: + historical_service = HistoricalAnalysisService() + + # Get historical analysis for the workload + analysis = await historical_service.get_workload_historical_analysis( + namespace, workload, time_range, prometheus_client + ) + + return { + "namespace": namespace, + "workload": workload, + "time_range": time_range, + "analysis": analysis, + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting historical analysis for workload {workload} in namespace {namespace}: {e}") + raise HTTPException(status_code=500, detail=str(e)) + @api_router.get("/namespace/{namespace}/pod/{pod_name}/historical-analysis") async def get_pod_historical_analysis( namespace: str, @@ -456,7 +542,7 @@ async def get_pod_historical_analysis( time_range: str = "24h", prometheus_client=Depends(get_prometheus_client) ): - """Get historical analysis for a specific pod""" + """Get historical analysis for a specific pod (legacy endpoint)""" try: historical_service = HistoricalAnalysisService() diff --git a/app/services/historical_analysis.py b/app/services/historical_analysis.py index 5403647..762bfa7 100644 --- a/app/services/historical_analysis.py +++ b/app/services/historical_analysis.py @@ -570,54 +570,224 @@ class HistoricalAnalysisService: 'recommendations': [] } + async def get_workload_historical_analysis(self, namespace: str, workload: str, time_range: str, prometheus_client): + """Get historical analysis for a specific workload/deployment""" + try: + logger.info(f"Getting historical analysis for workload: {workload} in namespace: {namespace}") + + # Query for CPU usage by workload (aggregated by workload) + cpu_query = f''' + sum( + node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{{ + cluster="", + namespace="{namespace}" + }} + * on(namespace,pod) + group_left(workload, workload_type) + namespace_workload_pod:kube_pod_owner:relabel{{ + cluster="", + namespace="{namespace}", + workload="{workload}", + workload_type=~".+" + }} + ) by (workload, workload_type) + ''' + + # Query for memory usage by workload (aggregated by workload) + memory_query = f''' + sum( + container_memory_working_set_bytes{{ + namespace="{namespace}", + container!="POD", + container!="" + }} + * on(namespace,pod) + group_left(workload, workload_type) + namespace_workload_pod:kube_pod_owner:relabel{{ + cluster="", + namespace="{namespace}", + workload="{workload}", + workload_type=~".+" + }} + ) by (workload, workload_type) + ''' + + # Query for CPU requests by namespace (using resource quota) + cpu_requests_query = f''' + scalar(kube_resourcequota{{ + cluster="", + namespace="{namespace}", + type="hard", + resource="requests.cpu" + }}) + ''' + + # Query for memory requests by namespace (using resource quota) + memory_requests_query = f''' + scalar(kube_resourcequota{{ + cluster="", + namespace="{namespace}", + type="hard", + resource="requests.memory" + }}) + ''' + + # Query for CPU limits by namespace (using resource quota) + cpu_limits_query = f''' + scalar(kube_resourcequota{{ + cluster="", + namespace="{namespace}", + type="hard", + resource="limits.cpu" + }}) + ''' + + # Query for memory limits by namespace (using resource quota) + memory_limits_query = f''' + scalar(kube_resourcequota{{ + cluster="", + namespace="{namespace}", + type="hard", + resource="limits.memory" + }}) + ''' + + # Execute queries + cpu_usage = await self._query_prometheus(cpu_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + memory_usage = await self._query_prometheus(memory_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + cpu_requests = await self._query_prometheus(cpu_requests_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + memory_requests = await self._query_prometheus(memory_requests_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + cpu_limits = await self._query_prometheus(cpu_limits_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + memory_limits = await self._query_prometheus(memory_limits_query, + datetime.now() - timedelta(seconds=self.time_ranges[time_range]), + datetime.now()) + + # Calculate utilization percentages + cpu_utilization = 0 + memory_utilization = 0 + + if cpu_usage and cpu_requests and cpu_requests[0][1] != '0': + cpu_utilization = (float(cpu_usage[0][1]) / float(cpu_requests[0][1])) * 100 + + if memory_usage and memory_requests and memory_requests[0][1] != '0': + memory_utilization = (float(memory_usage[0][1]) / float(memory_requests[0][1])) * 100 + + # Generate recommendations based on utilization + recommendations = [] + + if cpu_utilization > 80: + recommendations.append({ + "type": "cpu_high_utilization", + "severity": "warning", + "message": f"High CPU utilization: {cpu_utilization:.1f}%", + "recommendation": "Consider increasing CPU requests or optimizing application performance" + }) + elif cpu_utilization < 20 and cpu_utilization > 0: + recommendations.append({ + "type": "cpu_low_utilization", + "severity": "info", + "message": f"Low CPU utilization: {cpu_utilization:.1f}%", + "recommendation": "Consider reducing CPU requests to optimize resource allocation" + }) + + if memory_utilization > 80: + recommendations.append({ + "type": "memory_high_utilization", + "severity": "warning", + "message": f"High memory utilization: {memory_utilization:.1f}%", + "recommendation": "Consider increasing memory requests or optimizing memory usage" + }) + elif memory_utilization < 20 and memory_utilization > 0: + recommendations.append({ + "type": "memory_low_utilization", + "severity": "info", + "message": f"Low memory utilization: {memory_utilization:.1f}%", + "recommendation": "Consider reducing memory requests to optimize resource allocation" + }) + + return { + 'namespace': namespace, + 'workload': workload, + 'time_range': time_range, + 'cpu_usage': float(cpu_usage[0][1]) if cpu_usage else 0, + 'memory_usage': float(memory_usage[0][1]) if memory_usage else 0, + 'cpu_requests': float(cpu_requests[0][1]) if cpu_requests else 0, + 'memory_requests': float(memory_requests[0][1]) if memory_requests else 0, + 'cpu_limits': float(cpu_limits[0][1]) if cpu_limits else 0, + 'memory_limits': float(memory_limits[0][1]) if memory_limits else 0, + 'cpu_utilization': cpu_utilization, + 'memory_utilization': memory_utilization, + 'recommendations': recommendations + } + + except Exception as e: + logger.error(f"Error getting historical analysis for workload {workload} in namespace {namespace}: {e}") + return { + 'namespace': namespace, + 'workload': workload, + 'time_range': time_range, + 'error': str(e), + 'recommendations': [] + } + async def get_pod_historical_analysis(self, namespace: str, pod_name: str, time_range: str, prometheus_client): """Get historical analysis for a specific pod""" try: logger.info(f"Getting historical analysis for pod: {pod_name} in namespace: {namespace}") - # Query for CPU usage by pod + # Query for CPU usage by pod (more generic query) cpu_query = f''' sum(rate(container_cpu_usage_seconds_total{{ namespace="{namespace}", - pod="{pod_name}", + pod=~"{pod_name}.*", container!="POD", container!="" }}[{time_range}])) ''' - # Query for memory usage by pod + # Query for memory usage by pod (more generic query) memory_query = f''' sum(container_memory_working_set_bytes{{ namespace="{namespace}", - pod="{pod_name}", + pod=~"{pod_name}.*", container!="POD", container!="" }}) ''' - # Query for CPU requests by pod + # Query for CPU requests by pod (more generic query) cpu_requests_query = f''' sum(kube_pod_container_resource_requests{{ namespace="{namespace}", - pod="{pod_name}", + pod=~"{pod_name}.*", resource="cpu" }}) ''' - # Query for memory requests by pod + # Query for memory requests by pod (more generic query) memory_requests_query = f''' sum(kube_pod_container_resource_requests{{ namespace="{namespace}", - pod="{pod_name}", + pod=~"{pod_name}.*", resource="memory" }}) ''' - # Query for container count by pod + # Query for container count by pod (more generic query) container_count_query = f''' count(container_memory_working_set_bytes{{ namespace="{namespace}", - pod="{pod_name}", + pod=~"{pod_name}.*", container!="POD", container!="" }}) @@ -626,19 +796,19 @@ class HistoricalAnalysisService: # Execute queries cpu_usage = await self._query_prometheus(cpu_query, datetime.now() - timedelta(seconds=self.time_ranges[time_range]), - datetime.now(), prometheus_client) + datetime.now()) memory_usage = await self._query_prometheus(memory_query, datetime.now() - timedelta(seconds=self.time_ranges[time_range]), - datetime.now(), prometheus_client) + datetime.now()) cpu_requests = await self._query_prometheus(cpu_requests_query, datetime.now() - timedelta(seconds=self.time_ranges[time_range]), - datetime.now(), prometheus_client) + datetime.now()) memory_requests = await self._query_prometheus(memory_requests_query, datetime.now() - timedelta(seconds=self.time_ranges[time_range]), - datetime.now(), prometheus_client) + datetime.now()) container_count = await self._query_prometheus(container_count_query, datetime.now() - timedelta(seconds=self.time_ranges[time_range]), - datetime.now(), prometheus_client) + datetime.now()) # Calculate utilization percentages cpu_utilization = 0 diff --git a/app/static/index.html b/app/static/index.html index 8041043..36b4ac1 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -153,6 +153,34 @@ margin-top: 1.5rem; } + .time-range-selector { + display: flex; + gap: 0.5rem; + margin-bottom: 1rem; + padding: 1rem; + background: #f8f9fa; + border-radius: 0.5rem; + } + + .time-range-btn { + padding: 0.5rem 1rem; + border: 1px solid #ddd; + background: white; + border-radius: 0.25rem; + cursor: pointer; + transition: all 0.2s; + } + + .time-range-btn:hover { + background: #e9ecef; + } + + .time-range-btn.active { + background: #007bff; + color: white; + border-color: #007bff; + } + .page-header { padding: 0 1rem; } @@ -938,14 +966,21 @@ -
${data.analysis.error}
+Namespace: ${analysis.namespace}
+Time Range: ${analysis.time_range}
+${rec.recommendation}
+