diff --git a/app/api/routes.py b/app/api/routes.py index 5f1b916..f2ae479 100644 --- a/app/api/routes.py +++ b/app/api/routes.py @@ -498,127 +498,123 @@ async def get_workload_historical_metrics( """Get historical metrics for a specific workload with cluster percentages""" try: prometheus_client = PrometheusClient() + await prometheus_client.initialize() - # Get current usage using OpenShift-specific metrics - cpu_usage_query = f'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{{cluster="", namespace="{namespace}"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - memory_usage_query = f'sum(container_memory_working_set_bytes{{job="kubelet", metrics_path="/metrics/cadvisor", cluster="", namespace="{namespace}", container!="", image!=""}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - - cpu_usage_data = await prometheus_client.query(cpu_usage_query) - memory_usage_data = await prometheus_client.query(memory_usage_query) - - # Get resource requests and limits using OpenShift-specific metrics - cpu_requests_query = f'sum(kube_pod_container_resource_requests{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="cpu"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - memory_requests_query = f'sum(kube_pod_container_resource_requests{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="memory"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - - cpu_requests_data = await prometheus_client.query(cpu_requests_query) - memory_requests_data = await prometheus_client.query(memory_requests_query) - - cpu_limits_query = f'sum(kube_pod_container_resource_limits{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="cpu"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - memory_limits_query = f'sum(kube_pod_container_resource_limits{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="memory"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)' - - cpu_limits_data = await prometheus_client.query(cpu_limits_query) - memory_limits_data = await prometheus_client.query(memory_limits_query) - - # Get cluster total resources + # Get cluster total resources first cluster_cpu_query = 'sum(kube_node_status_allocatable{resource="cpu"})' cluster_memory_query = 'sum(kube_node_status_allocatable{resource="memory"})' cluster_cpu_data = await prometheus_client.query(cluster_cpu_query) cluster_memory_data = await prometheus_client.query(cluster_memory_query) - # Extract values from OpenShift-specific queries + # Extract cluster totals + cluster_cpu_total = 0 + cluster_memory_total = 0 + + if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"): + for result in cluster_cpu_data["data"]["result"]: + cluster_cpu_total += float(result["value"][1]) + + if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"): + for result in cluster_memory_data["data"]["result"]: + cluster_memory_total += float(result["value"][1]) + + # Get workload-specific metrics using simpler queries + # CPU usage for specific pod + cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~".*{workload}.*"}}[5m])' + memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~".*{workload}.*", container!="", image!=""}}' + + # Resource requests and limits for specific pod + cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})' + memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})' + cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})' + memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})' + + # Execute queries + cpu_usage_data = await prometheus_client.query(cpu_usage_query) + memory_usage_data = await prometheus_client.query(memory_usage_query) + cpu_requests_data = await prometheus_client.query(cpu_requests_query) + memory_requests_data = await prometheus_client.query(memory_requests_query) + cpu_limits_data = await prometheus_client.query(cpu_limits_query) + memory_limits_data = await prometheus_client.query(memory_limits_query) + + # Extract values cpu_usage = 0 memory_usage = 0 cpu_requests = 0 memory_requests = 0 cpu_limits = 0 memory_limits = 0 - cluster_cpu_total = 0 - cluster_memory_total = 0 - # Check if we got any data from Prometheus - prometheus_available = False - - # Extract CPU usage from workload-specific query + # Extract CPU usage if cpu_usage_data.get("status") == "success" and cpu_usage_data.get("data", {}).get("result"): for result in cpu_usage_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - cpu_usage = float(result["value"][1]) - break + cpu_usage += float(result["value"][1]) - # Extract Memory usage from workload-specific query + # Extract Memory usage if memory_usage_data.get("status") == "success" and memory_usage_data.get("data", {}).get("result"): for result in memory_usage_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - memory_usage = float(result["value"][1]) - break + memory_usage += float(result["value"][1]) - # Extract CPU requests from workload-specific query + # Extract CPU requests if cpu_requests_data.get("status") == "success" and cpu_requests_data.get("data", {}).get("result"): for result in cpu_requests_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - cpu_requests = float(result["value"][1]) - break + cpu_requests += float(result["value"][1]) - # Extract Memory requests from workload-specific query + # Extract Memory requests if memory_requests_data.get("status") == "success" and memory_requests_data.get("data", {}).get("result"): for result in memory_requests_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - memory_requests = float(result["value"][1]) - break + memory_requests += float(result["value"][1]) - # Extract CPU limits from workload-specific query + # Extract CPU limits if cpu_limits_data.get("status") == "success" and cpu_limits_data.get("data", {}).get("result"): for result in cpu_limits_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - cpu_limits = float(result["value"][1]) - break + cpu_limits += float(result["value"][1]) - # Extract Memory limits from workload-specific query + # Extract Memory limits if memory_limits_data.get("status") == "success" and memory_limits_data.get("data", {}).get("result"): for result in memory_limits_data["data"]["result"]: - if result.get("metric", {}).get("workload") == workload: - memory_limits = float(result["value"][1]) - break + memory_limits += float(result["value"][1]) - if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"): - cluster_cpu_total = float(cluster_cpu_data["data"]["result"][0]["value"][1]) + # Check if we have real data + prometheus_available = cluster_cpu_total > 0 and cluster_memory_total > 0 - if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"): - cluster_memory_total = float(cluster_memory_data["data"]["result"][0]["value"][1]) - - # Check if Prometheus is available (any non-zero values) - if cluster_cpu_total > 0 or cluster_memory_total > 0: - prometheus_available = True - - # If Prometheus is not available, provide simulated data for demonstration + # If no real data, return zeros with appropriate message if not prometheus_available: - # Simulate cluster resources (typical OpenShift cluster) - cluster_cpu_total = 24.0 # 6 nodes * 4 cores each - cluster_memory_total = 96.0 * (1024**3) # 6 nodes * 16GB each - - # Simulate workload metrics based on namespace - if namespace == "resource-governance": - cpu_usage = 0.05 - memory_usage = 128 * (1024**2) # 128MB - cpu_requests = 0.1 - memory_requests = 128 * (1024**2) - cpu_limits = 0.5 - memory_limits = 512 * (1024**2) - elif namespace == "shishika01": - cpu_usage = 0.15 - memory_usage = 256 * (1024**2) # 256MB - cpu_requests = 0.2 - memory_requests = 256 * (1024**2) - cpu_limits = 1.0 - memory_limits = 1024 * (1024**2) - else: - cpu_usage = 0.08 - memory_usage = 192 * (1024**2) # 192MB - cpu_requests = 0.15 - memory_requests = 192 * (1024**2) - cpu_limits = 0.8 - memory_limits = 768 * (1024**2) + return { + "workload": workload, + "namespace": namespace, + "time_range": time_range, + "prometheus_available": False, + "data_source": "no_data", + "message": "No metrics data available for this workload", + "cluster_total": { + "cpu_cores": 0, + "memory_bytes": 0, + "memory_gb": 0 + }, + "workload_metrics": { + "cpu": { + "usage_cores": 0, + "usage_percent": 0, + "requests_cores": 0, + "requests_percent": 0, + "limits_cores": 0, + "limits_percent": 0 + }, + "memory": { + "usage_bytes": 0, + "usage_mb": 0, + "usage_percent": 0, + "requests_bytes": 0, + "requests_mb": 0, + "requests_percent": 0, + "limits_bytes": 0, + "limits_mb": 0, + "limits_percent": 0 + } + } + } # Calculate percentages cpu_usage_percent = (cpu_usage / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0 @@ -632,8 +628,8 @@ async def get_workload_historical_metrics( "workload": workload, "namespace": namespace, "time_range": time_range, - "prometheus_available": prometheus_available, - "data_source": "simulated" if not prometheus_available else "prometheus", + "prometheus_available": True, + "data_source": "prometheus", "cluster_total": { "cpu_cores": cluster_cpu_total, "memory_bytes": cluster_memory_total, diff --git a/app/static/index.html b/app/static/index.html index 97c9ea1..f26429f 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -1566,9 +1566,9 @@ const workloadDataDiv = document.getElementById('workloadData'); // Add data source indicator - const dataSourceIndicator = data.data_source === 'simulated' ? - '