Implement OpenShift Console exact queries for CPU and Memory Usage

- Add get_workload_cpu_summary() and get_workload_memory_summary() methods
- Use exact OpenShift Console PromQL queries for data consistency
- Update historical analysis API endpoints to include real CPU/Memory data
- Document all OpenShift Console queries in AIAgents-Support.md
- Fix CPU Usage and Memory Usage columns showing N/A in Historical Analysis
This commit is contained in:
@@ -1419,15 +1419,33 @@ async def get_historical_analysis(
|
|||||||
}
|
}
|
||||||
workloads[workload_name]['pods'].append(pod)
|
workloads[workload_name]['pods'].append(pod)
|
||||||
|
|
||||||
# Convert to list and add basic info
|
# Convert to list and add basic info with real CPU/Memory data
|
||||||
workload_list = []
|
workload_list = []
|
||||||
|
historical_service = HistoricalAnalysisService()
|
||||||
|
|
||||||
for workload_name, workload_data in workloads.items():
|
for workload_name, workload_data in workloads.items():
|
||||||
|
# Get current CPU and Memory usage using OpenShift Console queries
|
||||||
|
try:
|
||||||
|
cpu_usage = await historical_service.get_workload_cpu_summary(workload_data['namespace'], workload_name)
|
||||||
|
memory_usage = await historical_service.get_workload_memory_summary(workload_data['namespace'], workload_name)
|
||||||
|
|
||||||
|
# Format CPU usage (cores)
|
||||||
|
cpu_display = f"{cpu_usage:.3f} cores" if cpu_usage > 0 else "N/A"
|
||||||
|
|
||||||
|
# Format memory usage (MB)
|
||||||
|
memory_display = f"{memory_usage / (1024 * 1024):.1f} MB" if memory_usage > 0 else "N/A"
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Error getting summary for {workload_name}: {e}")
|
||||||
|
cpu_display = "N/A"
|
||||||
|
memory_display = "N/A"
|
||||||
|
|
||||||
workload_list.append({
|
workload_list.append({
|
||||||
'name': workload_name,
|
'name': workload_name,
|
||||||
'namespace': workload_data['namespace'],
|
'namespace': workload_data['namespace'],
|
||||||
'pod_count': len(workload_data['pods']),
|
'pod_count': len(workload_data['pods']),
|
||||||
'cpu_usage': 'N/A', # Will be populated by Prometheus queries
|
'cpu_usage': cpu_display,
|
||||||
'memory_usage': 'N/A', # Will be populated by Prometheus queries
|
'memory_usage': memory_display,
|
||||||
'last_updated': datetime.now().isoformat()
|
'last_updated': datetime.now().isoformat()
|
||||||
})
|
})
|
||||||
|
|
||||||
@@ -1468,8 +1486,8 @@ async def get_workload_historical_details(
|
|||||||
cpu_data = await historical_service.get_cpu_usage_history(namespace, workload, time_range)
|
cpu_data = await historical_service.get_cpu_usage_history(namespace, workload, time_range)
|
||||||
memory_data = await historical_service.get_memory_usage_history(namespace, workload, time_range)
|
memory_data = await historical_service.get_memory_usage_history(namespace, workload, time_range)
|
||||||
|
|
||||||
# Generate recommendations
|
# Generate recommendations and get workload summary
|
||||||
recommendations = await historical_service.generate_recommendations(namespace, workload, time_range)
|
recommendations, workload_summary = await historical_service.generate_recommendations(namespace, workload, time_range)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"workload": workload,
|
"workload": workload,
|
||||||
@@ -1477,6 +1495,7 @@ async def get_workload_historical_details(
|
|||||||
"cpu_data": cpu_data,
|
"cpu_data": cpu_data,
|
||||||
"memory_data": memory_data,
|
"memory_data": memory_data,
|
||||||
"recommendations": recommendations,
|
"recommendations": recommendations,
|
||||||
|
"workload_summary": workload_summary,
|
||||||
"timestamp": datetime.now().isoformat()
|
"timestamp": datetime.now().isoformat()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1448,6 +1448,82 @@ class HistoricalAnalysisService:
|
|||||||
"error": str(e)
|
"error": str(e)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async def _get_workload_usage_summary(self, query: str, workload: str, metric: str) -> float:
    """Run a per-pod summary PromQL query and return the workload total.

    Queries Prometheus over the last 5 minutes and sums the latest
    per-pod values. Shared implementation for the CPU and memory
    summary methods, which previously duplicated this logic.

    Args:
        query: PromQL expression producing one series per pod.
        workload: Workload name (used only for error logging).
        metric: Human-readable metric label for log messages ("CPU"/"memory").

    Returns:
        Sum across pods as a float, or 0.0 when there is no data or the
        query fails (errors are logged, never raised, so callers can
        fall back to displaying "N/A").
    """
    try:
        # Capture "now" once so both window endpoints agree exactly.
        now = datetime.utcnow()
        data = await self._query_prometheus(query, now - timedelta(seconds=300), now)

        if data:
            # Sum all pod values for the workload, skipping Prometheus
            # 'NaN' placeholders before conversion.
            return sum(self._safe_float(point[1]) for point in data if point[1] != 'NaN')

        return 0.0

    except Exception as e:
        logger.error(f"Error getting {metric} summary for {workload}: {e}")
        return 0.0

async def get_workload_cpu_summary(self, namespace: str, workload: str) -> float:
    """Get current CPU usage (cores) for a workload using the exact
    OpenShift Console per-pod CPU query.

    Args:
        namespace: Namespace the workload runs in.
        workload: Workload (Deployment/StatefulSet/...) name.

    Returns:
        Total CPU usage in cores summed across the workload's pods,
        or 0.0 on error / no data.
    """
    # Exact OpenShift Console query: per-pod CPU rate joined to the
    # pod-owner relabel recording rule to filter to this workload.
    cpu_query = f'''
    sum(
      node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{{
        cluster="",
        namespace="{namespace}"
      }}
      * on(namespace,pod)
      group_left(workload, workload_type)
      namespace_workload_pod:kube_pod_owner:relabel{{
        cluster="",
        namespace="{namespace}",
        workload="{workload}",
        workload_type=~".+"
      }}
    ) by (pod)
    '''
    return await self._get_workload_usage_summary(cpu_query, workload, "CPU")

async def get_workload_memory_summary(self, namespace: str, workload: str) -> float:
    """Get current memory usage (bytes) for a workload using the exact
    OpenShift Console per-pod memory query.

    Args:
        namespace: Namespace the workload runs in.
        workload: Workload (Deployment/StatefulSet/...) name.

    Returns:
        Total working-set memory in bytes summed across the workload's
        pods, or 0.0 on error / no data. Callers convert to MB.
    """
    # Exact OpenShift Console query: working-set bytes per container
    # (excluding pause containers via container!=""/image!="") joined to
    # the pod-owner relabel rule to filter to this workload.
    memory_query = f'''
    sum(
      container_memory_working_set_bytes{{
        cluster="",
        namespace="{namespace}",
        container!="",
        image!=""
      }}
      * on(namespace,pod)
      group_left(workload, workload_type)
      namespace_workload_pod:kube_pod_owner:relabel{{
        cluster="",
        namespace="{namespace}",
        workload="{workload}",
        workload_type=~".+"
      }}
    ) by (pod)
    '''
    return await self._get_workload_usage_summary(memory_query, workload, "memory")
|
||||||
|
|
||||||
async def generate_recommendations(self, namespace: str, workload: str, time_range: str = "24h") -> List[Dict[str, Any]]:
|
async def generate_recommendations(self, namespace: str, workload: str, time_range: str = "24h") -> List[Dict[str, Any]]:
|
||||||
"""Generate recommendations based on historical data"""
|
"""Generate recommendations based on historical data"""
|
||||||
try:
|
try:
|
||||||
@@ -1455,6 +1531,10 @@ class HistoricalAnalysisService:
|
|||||||
cpu_data = await self.get_cpu_usage_history(namespace, workload, time_range)
|
cpu_data = await self.get_cpu_usage_history(namespace, workload, time_range)
|
||||||
memory_data = await self.get_memory_usage_history(namespace, workload, time_range)
|
memory_data = await self.get_memory_usage_history(namespace, workload, time_range)
|
||||||
|
|
||||||
|
# Get current summary values for the workload
|
||||||
|
current_cpu_usage = await self.get_workload_cpu_summary(namespace, workload)
|
||||||
|
current_memory_usage = await self.get_workload_memory_summary(namespace, workload)
|
||||||
|
|
||||||
recommendations = []
|
recommendations = []
|
||||||
|
|
||||||
# Analyze CPU data
|
# Analyze CPU data
|
||||||
@@ -1505,7 +1585,16 @@ class HistoricalAnalysisService:
|
|||||||
"recommendation": "Increase memory limits to handle peak usage"
|
"recommendation": "Increase memory limits to handle peak usage"
|
||||||
})
|
})
|
||||||
|
|
||||||
return recommendations
|
# Add workload summary data to recommendations
|
||||||
|
workload_summary = {
|
||||||
|
"workload": workload,
|
||||||
|
"namespace": namespace,
|
||||||
|
"cpu_usage": current_cpu_usage,
|
||||||
|
"memory_usage": current_memory_usage / (1024 * 1024), # Convert bytes to MB
|
||||||
|
"time_range": time_range
|
||||||
|
}
|
||||||
|
|
||||||
|
return recommendations, workload_summary
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error generating recommendations: {str(e)}")
|
logger.error(f"Error generating recommendations: {str(e)}")
|
||||||
@@ -1514,4 +1603,4 @@ class HistoricalAnalysisService:
|
|||||||
"severity": "error",
|
"severity": "error",
|
||||||
"message": f"Error generating recommendations: {str(e)}",
|
"message": f"Error generating recommendations: {str(e)}",
|
||||||
"recommendation": "Check Prometheus connectivity and workload configuration"
|
"recommendation": "Check Prometheus connectivity and workload configuration"
|
||||||
}]
|
}], None
|
||||||
|
|||||||
Reference in New Issue
Block a user