Remove simulated data and enable real Prometheus metrics
This commit is contained in:
@@ -498,127 +498,123 @@ async def get_workload_historical_metrics(
|
||||
"""Get historical metrics for a specific workload with cluster percentages"""
|
||||
try:
|
||||
prometheus_client = PrometheusClient()
|
||||
await prometheus_client.initialize()
|
||||
|
||||
# Get current usage using OpenShift-specific metrics
|
||||
cpu_usage_query = f'sum(node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{{cluster="", namespace="{namespace}"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
memory_usage_query = f'sum(container_memory_working_set_bytes{{job="kubelet", metrics_path="/metrics/cadvisor", cluster="", namespace="{namespace}", container!="", image!=""}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
|
||||
cpu_usage_data = await prometheus_client.query(cpu_usage_query)
|
||||
memory_usage_data = await prometheus_client.query(memory_usage_query)
|
||||
|
||||
# Get resource requests and limits using OpenShift-specific metrics
|
||||
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="cpu"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
memory_requests_query = f'sum(kube_pod_container_resource_requests{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="memory"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
|
||||
cpu_requests_data = await prometheus_client.query(cpu_requests_query)
|
||||
memory_requests_data = await prometheus_client.query(memory_requests_query)
|
||||
|
||||
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="cpu"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
memory_limits_query = f'sum(kube_pod_container_resource_limits{{job="kube-state-metrics", cluster="", namespace="{namespace}", resource="memory"}} * on(namespace,pod) group_left(workload, workload_type) namespace_workload_pod:kube_pod_owner:relabel{{cluster="", namespace="{namespace}", workload_type=~".+"}}) by (workload, workload_type)'
|
||||
|
||||
cpu_limits_data = await prometheus_client.query(cpu_limits_query)
|
||||
memory_limits_data = await prometheus_client.query(memory_limits_query)
|
||||
|
||||
# Get cluster total resources
|
||||
# Get cluster total resources first
|
||||
cluster_cpu_query = 'sum(kube_node_status_allocatable{resource="cpu"})'
|
||||
cluster_memory_query = 'sum(kube_node_status_allocatable{resource="memory"})'
|
||||
|
||||
cluster_cpu_data = await prometheus_client.query(cluster_cpu_query)
|
||||
cluster_memory_data = await prometheus_client.query(cluster_memory_query)
|
||||
|
||||
# Extract values from OpenShift-specific queries
|
||||
# Extract cluster totals
|
||||
cluster_cpu_total = 0
|
||||
cluster_memory_total = 0
|
||||
|
||||
if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"):
|
||||
for result in cluster_cpu_data["data"]["result"]:
|
||||
cluster_cpu_total += float(result["value"][1])
|
||||
|
||||
if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"):
|
||||
for result in cluster_memory_data["data"]["result"]:
|
||||
cluster_memory_total += float(result["value"][1])
|
||||
|
||||
# Get workload-specific metrics using simpler queries
|
||||
# CPU usage for specific pod
|
||||
cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~".*{workload}.*"}}[5m])'
|
||||
memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~".*{workload}.*", container!="", image!=""}}'
|
||||
|
||||
# Resource requests and limits for specific pod
|
||||
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
|
||||
memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
|
||||
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="cpu"}})'
|
||||
memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~".*{workload}.*", resource="memory"}})'
|
||||
|
||||
# Execute queries
|
||||
cpu_usage_data = await prometheus_client.query(cpu_usage_query)
|
||||
memory_usage_data = await prometheus_client.query(memory_usage_query)
|
||||
cpu_requests_data = await prometheus_client.query(cpu_requests_query)
|
||||
memory_requests_data = await prometheus_client.query(memory_requests_query)
|
||||
cpu_limits_data = await prometheus_client.query(cpu_limits_query)
|
||||
memory_limits_data = await prometheus_client.query(memory_limits_query)
|
||||
|
||||
# Extract values
|
||||
cpu_usage = 0
|
||||
memory_usage = 0
|
||||
cpu_requests = 0
|
||||
memory_requests = 0
|
||||
cpu_limits = 0
|
||||
memory_limits = 0
|
||||
cluster_cpu_total = 0
|
||||
cluster_memory_total = 0
|
||||
|
||||
# Check if we got any data from Prometheus
|
||||
prometheus_available = False
|
||||
|
||||
# Extract CPU usage from workload-specific query
|
||||
# Extract CPU usage
|
||||
if cpu_usage_data.get("status") == "success" and cpu_usage_data.get("data", {}).get("result"):
|
||||
for result in cpu_usage_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
cpu_usage = float(result["value"][1])
|
||||
break
|
||||
cpu_usage += float(result["value"][1])
|
||||
|
||||
# Extract Memory usage from workload-specific query
|
||||
# Extract Memory usage
|
||||
if memory_usage_data.get("status") == "success" and memory_usage_data.get("data", {}).get("result"):
|
||||
for result in memory_usage_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
memory_usage = float(result["value"][1])
|
||||
break
|
||||
memory_usage += float(result["value"][1])
|
||||
|
||||
# Extract CPU requests from workload-specific query
|
||||
# Extract CPU requests
|
||||
if cpu_requests_data.get("status") == "success" and cpu_requests_data.get("data", {}).get("result"):
|
||||
for result in cpu_requests_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
cpu_requests = float(result["value"][1])
|
||||
break
|
||||
cpu_requests += float(result["value"][1])
|
||||
|
||||
# Extract Memory requests from workload-specific query
|
||||
# Extract Memory requests
|
||||
if memory_requests_data.get("status") == "success" and memory_requests_data.get("data", {}).get("result"):
|
||||
for result in memory_requests_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
memory_requests = float(result["value"][1])
|
||||
break
|
||||
memory_requests += float(result["value"][1])
|
||||
|
||||
# Extract CPU limits from workload-specific query
|
||||
# Extract CPU limits
|
||||
if cpu_limits_data.get("status") == "success" and cpu_limits_data.get("data", {}).get("result"):
|
||||
for result in cpu_limits_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
cpu_limits = float(result["value"][1])
|
||||
break
|
||||
cpu_limits += float(result["value"][1])
|
||||
|
||||
# Extract Memory limits from workload-specific query
|
||||
# Extract Memory limits
|
||||
if memory_limits_data.get("status") == "success" and memory_limits_data.get("data", {}).get("result"):
|
||||
for result in memory_limits_data["data"]["result"]:
|
||||
if result.get("metric", {}).get("workload") == workload:
|
||||
memory_limits = float(result["value"][1])
|
||||
break
|
||||
memory_limits += float(result["value"][1])
|
||||
|
||||
if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"):
|
||||
cluster_cpu_total = float(cluster_cpu_data["data"]["result"][0]["value"][1])
|
||||
# Check if we have real data
|
||||
prometheus_available = cluster_cpu_total > 0 and cluster_memory_total > 0
|
||||
|
||||
if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"):
|
||||
cluster_memory_total = float(cluster_memory_data["data"]["result"][0]["value"][1])
|
||||
|
||||
# Check if Prometheus is available (any non-zero values)
|
||||
if cluster_cpu_total > 0 or cluster_memory_total > 0:
|
||||
prometheus_available = True
|
||||
|
||||
# If Prometheus is not available, provide simulated data for demonstration
|
||||
# If no real data, return zeros with appropriate message
|
||||
if not prometheus_available:
|
||||
# Simulate cluster resources (typical OpenShift cluster)
|
||||
cluster_cpu_total = 24.0 # 6 nodes * 4 cores each
|
||||
cluster_memory_total = 96.0 * (1024**3) # 6 nodes * 16GB each
|
||||
|
||||
# Simulate workload metrics based on namespace
|
||||
if namespace == "resource-governance":
|
||||
cpu_usage = 0.05
|
||||
memory_usage = 128 * (1024**2) # 128MB
|
||||
cpu_requests = 0.1
|
||||
memory_requests = 128 * (1024**2)
|
||||
cpu_limits = 0.5
|
||||
memory_limits = 512 * (1024**2)
|
||||
elif namespace == "shishika01":
|
||||
cpu_usage = 0.15
|
||||
memory_usage = 256 * (1024**2) # 256MB
|
||||
cpu_requests = 0.2
|
||||
memory_requests = 256 * (1024**2)
|
||||
cpu_limits = 1.0
|
||||
memory_limits = 1024 * (1024**2)
|
||||
else:
|
||||
cpu_usage = 0.08
|
||||
memory_usage = 192 * (1024**2) # 192MB
|
||||
cpu_requests = 0.15
|
||||
memory_requests = 192 * (1024**2)
|
||||
cpu_limits = 0.8
|
||||
memory_limits = 768 * (1024**2)
|
||||
return {
|
||||
"workload": workload,
|
||||
"namespace": namespace,
|
||||
"time_range": time_range,
|
||||
"prometheus_available": False,
|
||||
"data_source": "no_data",
|
||||
"message": "No metrics data available for this workload",
|
||||
"cluster_total": {
|
||||
"cpu_cores": 0,
|
||||
"memory_bytes": 0,
|
||||
"memory_gb": 0
|
||||
},
|
||||
"workload_metrics": {
|
||||
"cpu": {
|
||||
"usage_cores": 0,
|
||||
"usage_percent": 0,
|
||||
"requests_cores": 0,
|
||||
"requests_percent": 0,
|
||||
"limits_cores": 0,
|
||||
"limits_percent": 0
|
||||
},
|
||||
"memory": {
|
||||
"usage_bytes": 0,
|
||||
"usage_mb": 0,
|
||||
"usage_percent": 0,
|
||||
"requests_bytes": 0,
|
||||
"requests_mb": 0,
|
||||
"requests_percent": 0,
|
||||
"limits_bytes": 0,
|
||||
"limits_mb": 0,
|
||||
"limits_percent": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Calculate percentages
|
||||
cpu_usage_percent = (cpu_usage / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
||||
@@ -632,8 +628,8 @@ async def get_workload_historical_metrics(
|
||||
"workload": workload,
|
||||
"namespace": namespace,
|
||||
"time_range": time_range,
|
||||
"prometheus_available": prometheus_available,
|
||||
"data_source": "simulated" if not prometheus_available else "prometheus",
|
||||
"prometheus_available": True,
|
||||
"data_source": "prometheus",
|
||||
"cluster_total": {
|
||||
"cpu_cores": cluster_cpu_total,
|
||||
"memory_bytes": cluster_memory_total,
|
||||
|
||||
Reference in New Issue
Block a user