Implement workload-based historical analysis with timeline buttons

This commit is contained in:
2025-09-26 13:50:44 -03:00
parent 85446e673e
commit 0a5b8a03c6
3 changed files with 454 additions and 32 deletions

View File

@@ -55,16 +55,74 @@ async def get_cluster_status(
# Get VPA recommendations # Get VPA recommendations
vpa_recommendations = await k8s_client.get_vpa_recommendations() vpa_recommendations = await k8s_client.get_vpa_recommendations()
# Generate report # Group pods by namespace for the frontend
report = report_service.generate_cluster_report( namespaces_data = {}
pods=pods, for pod in pods:
validations=all_validations, namespace = pod.namespace
vpa_recommendations=vpa_recommendations, if namespace not in namespaces_data:
overcommit_info=overcommit_info, namespaces_data[namespace] = {
nodes_info=nodes_info 'namespace': namespace,
) 'pods': {},
'total_validations': 0,
'severity_breakdown': {'error': 0, 'warning': 0, 'info': 0}
}
# Add pod to namespace
pod_name = pod.name
pod_validations = validation_service.validate_pod_resources(pod)
# Convert pod to the format expected by frontend
pod_data = {
'pod_name': pod_name,
'namespace': namespace,
'phase': pod.phase,
'node_name': pod.node_name,
'containers': [],
'validations': []
}
# Add containers
for container in pod.containers:
container_data = {
'name': container['name'],
'image': container['image'],
'resources': container['resources']
}
pod_data['containers'].append(container_data)
# Add validations for this pod
for validation in pod_validations:
validation_data = {
'rule_name': validation.validation_type,
'namespace': namespace,
'message': validation.message,
'recommendation': validation.recommendation,
'severity': validation.severity
}
pod_data['validations'].append(validation_data)
# Update namespace severity breakdown
namespaces_data[namespace]['severity_breakdown'][validation.severity] += 1
namespaces_data[namespace]['total_validations'] += 1
namespaces_data[namespace]['pods'][pod_name] = pod_data
return report # Convert to list format expected by frontend
namespaces_list = list(namespaces_data.values())
# Count total errors and warnings
total_errors = sum(ns['severity_breakdown']['error'] for ns in namespaces_list)
total_warnings = sum(ns['severity_breakdown']['warning'] for ns in namespaces_list)
return {
"timestamp": datetime.now().isoformat(),
"total_pods": len(pods),
"total_namespaces": len(namespaces_list),
"total_nodes": len(nodes_info) if nodes_info else 0,
"total_errors": total_errors,
"total_warnings": total_warnings,
"namespaces": namespaces_list
}
except Exception as e: except Exception as e:
logger.error(f"Error getting cluster status: {e}") logger.error(f"Error getting cluster status: {e}")
@@ -449,6 +507,34 @@ async def get_namespace_historical_analysis(
logger.error(f"Error getting historical analysis for namespace {namespace}: {e}") logger.error(f"Error getting historical analysis for namespace {namespace}: {e}")
raise HTTPException(status_code=500, detail=str(e)) raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/namespace/{namespace}/workload/{workload}/historical-analysis")
async def get_workload_historical_analysis(
    namespace: str,
    workload: str,
    time_range: str = "24h",
    prometheus_client=Depends(get_prometheus_client)
):
    """Return the historical analysis of a single workload/deployment.

    The heavy lifting is delegated to
    ``HistoricalAnalysisService.get_workload_historical_analysis``; this
    endpoint only echoes the request parameters back and stamps the
    response with the current time. Any exception is logged and reported
    to the client as HTTP 500.
    """
    try:
        # A fresh service instance is created for every request.
        result = await HistoricalAnalysisService().get_workload_historical_analysis(
            namespace, workload, time_range, prometheus_client
        )
        payload = {
            "namespace": namespace,
            "workload": workload,
            "time_range": time_range,
            "analysis": result,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        logger.error(f"Error getting historical analysis for workload {workload} in namespace {namespace}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
    else:
        return payload
@api_router.get("/namespace/{namespace}/pod/{pod_name}/historical-analysis") @api_router.get("/namespace/{namespace}/pod/{pod_name}/historical-analysis")
async def get_pod_historical_analysis( async def get_pod_historical_analysis(
namespace: str, namespace: str,
@@ -456,7 +542,7 @@ async def get_pod_historical_analysis(
time_range: str = "24h", time_range: str = "24h",
prometheus_client=Depends(get_prometheus_client) prometheus_client=Depends(get_prometheus_client)
): ):
"""Get historical analysis for a specific pod""" """Get historical analysis for a specific pod (legacy endpoint)"""
try: try:
historical_service = HistoricalAnalysisService() historical_service = HistoricalAnalysisService()

View File

@@ -570,54 +570,224 @@ class HistoricalAnalysisService:
'recommendations': [] 'recommendations': []
} }
async def get_workload_historical_analysis(self, namespace: str, workload: str, time_range: str, prometheus_client):
    """Get historical analysis for a specific workload/deployment.

    Queries Prometheus for the workload's aggregated CPU and memory usage
    and for the namespace ResourceQuota ``hard`` values (requests and
    limits), derives utilization percentages from them and turns those
    into recommendations.

    NOTE: the "requests"/"limits" figures come from ``kube_resourcequota``
    for the whole namespace, so the utilization percentages express the
    workload's usage relative to the namespace quota, not relative to the
    workload's own container requests.

    Args:
        namespace: Namespace that owns the workload.
        workload: Workload name, matched against the ``workload`` label of
            ``namespace_workload_pod:kube_pod_owner:relabel``.
        time_range: Key into ``self.time_ranges`` selecting the look-back
            window (in seconds).
        prometheus_client: Accepted for interface compatibility; this
            method does not use it (queries go through
            ``self._query_prometheus``).

    Returns:
        A dict with usage, request and limit values, the two utilization
        percentages and a ``recommendations`` list. On any failure a dict
        carrying ``error`` and an empty ``recommendations`` list is
        returned instead of raising.
    """

    def quote_label(value):
        # namespace/workload originate from the request URL; escape the
        # characters that could terminate or alter a double-quoted PromQL
        # string literal so they cannot inject extra matchers.
        return str(value).replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')

    def first_value(series):
        # NOTE(review): assumes _query_prometheus returns a sequence of
        # (timestamp, value) pairs and that the first sample is the one
        # wanted -- confirm against _query_prometheus.
        return float(series[0][1]) if series else 0

    def utilization(usage_series, requested):
        # Compare numerically: a textual check against '0' lets '0.0' through
        # and ends in a ZeroDivisionError.
        if usage_series and requested > 0:
            return (first_value(usage_series) / requested) * 100
        return 0

    def advise(kind, label, pct, high_tip):
        # One recommendation at most per resource: >80% warns, (0, 20)% hints.
        if pct > 80:
            return [{
                "type": f"{kind}_high_utilization",
                "severity": "warning",
                "message": f"High {label} utilization: {pct:.1f}%",
                "recommendation": high_tip
            }]
        if 0 < pct < 20:
            return [{
                "type": f"{kind}_low_utilization",
                "severity": "info",
                "message": f"Low {label} utilization: {pct:.1f}%",
                "recommendation": f"Consider reducing {label} requests to optimize resource allocation"
            }]
        return []

    try:
        logger.info(f"Getting historical analysis for workload: {workload} in namespace: {namespace}")

        ns = quote_label(namespace)
        wl = quote_label(workload)

        def quota_query(resource):
            # Namespace-level ResourceQuota "hard" value for one resource.
            return f'''
                scalar(kube_resourcequota{{
                    cluster="",
                    namespace="{ns}",
                    type="hard",
                    resource="{resource}"
                }})
            '''

        # CPU usage aggregated over all pods owned by the workload
        cpu_query = f'''
            sum(
                node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{{
                    cluster="",
                    namespace="{ns}"
                }}
                * on(namespace,pod)
                group_left(workload, workload_type)
                namespace_workload_pod:kube_pod_owner:relabel{{
                    cluster="",
                    namespace="{ns}",
                    workload="{wl}",
                    workload_type=~".+"
                }}
            ) by (workload, workload_type)
        '''

        # Memory working set aggregated over all pods owned by the workload
        memory_query = f'''
            sum(
                container_memory_working_set_bytes{{
                    namespace="{ns}",
                    container!="POD",
                    container!=""
                }}
                * on(namespace,pod)
                group_left(workload, workload_type)
                namespace_workload_pod:kube_pod_owner:relabel{{
                    cluster="",
                    namespace="{ns}",
                    workload="{wl}",
                    workload_type=~".+"
                }}
            ) by (workload, workload_type)
        '''

        cpu_requests_query = quota_query("requests.cpu")
        memory_requests_query = quota_query("requests.memory")
        cpu_limits_query = quota_query("limits.cpu")
        memory_limits_query = quota_query("limits.memory")

        # Resolve the window once so every query covers the same interval.
        try:
            window = timedelta(seconds=self.time_ranges[time_range])
        except KeyError:
            raise ValueError(f"Unsupported time_range: {time_range!r}") from None
        end_time = datetime.now()
        start_time = end_time - window

        # Execute queries
        cpu_usage = await self._query_prometheus(cpu_query, start_time, end_time)
        memory_usage = await self._query_prometheus(memory_query, start_time, end_time)
        cpu_requests = await self._query_prometheus(cpu_requests_query, start_time, end_time)
        memory_requests = await self._query_prometheus(memory_requests_query, start_time, end_time)
        cpu_limits = await self._query_prometheus(cpu_limits_query, start_time, end_time)
        memory_limits = await self._query_prometheus(memory_limits_query, start_time, end_time)

        cpu_requests_value = first_value(cpu_requests)
        memory_requests_value = first_value(memory_requests)

        # Calculate utilization percentages
        cpu_utilization = utilization(cpu_usage, cpu_requests_value)
        memory_utilization = utilization(memory_usage, memory_requests_value)

        # Generate recommendations based on utilization
        recommendations = advise(
            "cpu", "CPU", cpu_utilization,
            "Consider increasing CPU requests or optimizing application performance"
        ) + advise(
            "memory", "memory", memory_utilization,
            "Consider increasing memory requests or optimizing memory usage"
        )

        return {
            'namespace': namespace,
            'workload': workload,
            'time_range': time_range,
            'cpu_usage': first_value(cpu_usage),
            'memory_usage': first_value(memory_usage),
            'cpu_requests': cpu_requests_value,
            'memory_requests': memory_requests_value,
            'cpu_limits': first_value(cpu_limits),
            'memory_limits': first_value(memory_limits),
            'cpu_utilization': cpu_utilization,
            'memory_utilization': memory_utilization,
            'recommendations': recommendations
        }

    except Exception as e:
        # Best-effort contract: callers receive an error payload, not an exception.
        logger.error(f"Error getting historical analysis for workload {workload} in namespace {namespace}: {e}")
        return {
            'namespace': namespace,
            'workload': workload,
            'time_range': time_range,
            'error': str(e),
            'recommendations': []
        }
async def get_pod_historical_analysis(self, namespace: str, pod_name: str, time_range: str, prometheus_client): async def get_pod_historical_analysis(self, namespace: str, pod_name: str, time_range: str, prometheus_client):
"""Get historical analysis for a specific pod""" """Get historical analysis for a specific pod"""
try: try:
logger.info(f"Getting historical analysis for pod: {pod_name} in namespace: {namespace}") logger.info(f"Getting historical analysis for pod: {pod_name} in namespace: {namespace}")
# Query for CPU usage by pod # Query for CPU usage by pod (more generic query)
cpu_query = f''' cpu_query = f'''
sum(rate(container_cpu_usage_seconds_total{{ sum(rate(container_cpu_usage_seconds_total{{
namespace="{namespace}", namespace="{namespace}",
pod="{pod_name}", pod=~"{pod_name}.*",
container!="POD", container!="POD",
container!="" container!=""
}}[{time_range}])) }}[{time_range}]))
''' '''
# Query for memory usage by pod # Query for memory usage by pod (more generic query)
memory_query = f''' memory_query = f'''
sum(container_memory_working_set_bytes{{ sum(container_memory_working_set_bytes{{
namespace="{namespace}", namespace="{namespace}",
pod="{pod_name}", pod=~"{pod_name}.*",
container!="POD", container!="POD",
container!="" container!=""
}}) }})
''' '''
# Query for CPU requests by pod # Query for CPU requests by pod (more generic query)
cpu_requests_query = f''' cpu_requests_query = f'''
sum(kube_pod_container_resource_requests{{ sum(kube_pod_container_resource_requests{{
namespace="{namespace}", namespace="{namespace}",
pod="{pod_name}", pod=~"{pod_name}.*",
resource="cpu" resource="cpu"
}}) }})
''' '''
# Query for memory requests by pod # Query for memory requests by pod (more generic query)
memory_requests_query = f''' memory_requests_query = f'''
sum(kube_pod_container_resource_requests{{ sum(kube_pod_container_resource_requests{{
namespace="{namespace}", namespace="{namespace}",
pod="{pod_name}", pod=~"{pod_name}.*",
resource="memory" resource="memory"
}}) }})
''' '''
# Query for container count by pod # Query for container count by pod (more generic query)
container_count_query = f''' container_count_query = f'''
count(container_memory_working_set_bytes{{ count(container_memory_working_set_bytes{{
namespace="{namespace}", namespace="{namespace}",
pod="{pod_name}", pod=~"{pod_name}.*",
container!="POD", container!="POD",
container!="" container!=""
}}) }})
@@ -626,19 +796,19 @@ class HistoricalAnalysisService:
# Execute queries # Execute queries
cpu_usage = await self._query_prometheus(cpu_query, cpu_usage = await self._query_prometheus(cpu_query,
datetime.now() - timedelta(seconds=self.time_ranges[time_range]), datetime.now() - timedelta(seconds=self.time_ranges[time_range]),
datetime.now(), prometheus_client) datetime.now())
memory_usage = await self._query_prometheus(memory_query, memory_usage = await self._query_prometheus(memory_query,
datetime.now() - timedelta(seconds=self.time_ranges[time_range]), datetime.now() - timedelta(seconds=self.time_ranges[time_range]),
datetime.now(), prometheus_client) datetime.now())
cpu_requests = await self._query_prometheus(cpu_requests_query, cpu_requests = await self._query_prometheus(cpu_requests_query,
datetime.now() - timedelta(seconds=self.time_ranges[time_range]), datetime.now() - timedelta(seconds=self.time_ranges[time_range]),
datetime.now(), prometheus_client) datetime.now())
memory_requests = await self._query_prometheus(memory_requests_query, memory_requests = await self._query_prometheus(memory_requests_query,
datetime.now() - timedelta(seconds=self.time_ranges[time_range]), datetime.now() - timedelta(seconds=self.time_ranges[time_range]),
datetime.now(), prometheus_client) datetime.now())
container_count = await self._query_prometheus(container_count_query, container_count = await self._query_prometheus(container_count_query,
datetime.now() - timedelta(seconds=self.time_ranges[time_range]), datetime.now() - timedelta(seconds=self.time_ranges[time_range]),
datetime.now(), prometheus_client) datetime.now())
# Calculate utilization percentages # Calculate utilization percentages
cpu_utilization = 0 cpu_utilization = 0

View File

@@ -153,6 +153,34 @@
margin-top: 1.5rem; margin-top: 1.5rem;
} }
/* Row that holds the time-range buttons shown in the historical analysis modal */
.time-range-selector {
display: flex;
gap: 0.5rem;
margin-bottom: 1rem;
padding: 1rem;
background: #f8f9fa;
border-radius: 0.5rem;
}
/* A single time-range button in its default (unselected) state */
.time-range-btn {
padding: 0.5rem 1rem;
border: 1px solid #ddd;
background: white;
border-radius: 0.25rem;
cursor: pointer;
transition: all 0.2s;
}
/* Hover feedback for time-range buttons */
.time-range-btn:hover {
background: #e9ecef;
}
/* Currently selected range; the "active" class is toggled from the click handler script */
.time-range-btn.active {
background: #007bff;
color: white;
border-color: #007bff;
}
.page-header { .page-header {
padding: 0 1rem; padding: 0 1rem;
} }
@@ -938,14 +966,21 @@
</div> </div>
<!-- Historical Analysis Modal --> <!-- Historical Analysis Modal -->
<div class="modal hidden" id="historicalModal" style="display: none;"> <div class="modal hidden" id="historicalModal">
<div class="modal-content"> <div class="modal-content">
<div class="modal-header"> <div class="modal-header">
<h2 id="historicalModalTitle">Historical Analysis</h2> <h2 id="historicalModalTitle">Historical Analysis</h2>
<button class="modal-close" onclick="closeHistoricalModal()">&times;</button> <button class="modal-close" onclick="closeHistoricalModal()">&times;</button>
</div> </div>
<div class="modal-body" id="historicalModalBody"> <div class="modal-body" id="historicalModalBody">
<!-- Content will be loaded here --> <div class="time-range-selector">
<button class="time-range-btn active" data-range="24h">1 Day</button>
<button class="time-range-btn" data-range="7d">7 Days</button>
<button class="time-range-btn" data-range="30d">30 Days</button>
</div>
<div id="historicalAnalysisContent">
<p>Loading historical analysis...</p>
</div>
</div> </div>
</div> </div>
</div> </div>
@@ -1217,7 +1252,7 @@
<div class="pod-name">${pod.pod_name}</div> <div class="pod-name">${pod.pod_name}</div>
<div style="display: flex; align-items: center; gap: 0.5rem;"> <div style="display: flex; align-items: center; gap: 0.5rem;">
<div class="pod-validations-count">${pod.validations.length} validations</div> <div class="pod-validations-count">${pod.validations.length} validations</div>
<button class="btn btn-secondary" style="padding: 0.25rem 0.5rem; font-size: 0.8rem;" onclick="loadPodHistoricalAnalysis('${namespace.namespace}', '${pod.pod_name}')">Historical Analysis</button> <button class="btn btn-secondary" style="padding: 0.25rem 0.5rem; font-size: 0.8rem;" onclick="loadWorkloadHistoricalAnalysis('${namespace.namespace}', '${pod.pod_name.split('-').slice(0, -2).join('-')}')">Historical Analysis</button>
</div> </div>
</div> </div>
<div class="validation-list"> <div class="validation-list">
@@ -1518,11 +1553,38 @@
} }
// Historical Analysis Modal Functions // Historical Analysis Modal Functions
async function loadPodHistoricalAnalysis(namespace, podName) { let currentNamespace = null;
let currentWorkload = null;
/**
 * Fetch the historical analysis of a workload and show it in the modal.
 *
 * Remembers the namespace/workload in the module-level `currentNamespace`
 * and `currentWorkload` so the time-range buttons can reload the same
 * workload with another range.
 *
 * @param {string} namespace - Namespace that owns the workload.
 * @param {string} workload - Workload name.
 * @param {string} [timeRange='24h'] - Value sent as the `time_range` query parameter.
 */
async function loadWorkloadHistoricalAnalysis(namespace, workload, timeRange = '24h') {
    showLoading();

    currentNamespace = namespace;
    currentWorkload = workload;

    // Keep the highlighted button in sync with the range actually being
    // loaded; otherwise reopening the modal with the default range can
    // leave a previously selected button marked active.
    document.querySelectorAll('.time-range-btn').forEach(btn => {
        btn.classList.toggle('active', btn.getAttribute('data-range') === timeRange);
    });

    try {
        // Encode every dynamic URL component so names containing reserved
        // characters cannot change the request path or query string.
        const url = `/api/v1/namespace/${encodeURIComponent(namespace)}` +
            `/workload/${encodeURIComponent(workload)}` +
            `/historical-analysis?time_range=${encodeURIComponent(timeRange)}`;
        const response = await fetch(url);

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
        }

        const data = await response.json();
        displayWorkloadHistoricalAnalysis(data);
        showHistoricalModal(`${namespace}/${workload}`);
    } catch (error) {
        showError('Error loading historical analysis: ' + error.message);
    } finally {
        hideLoading();
    }
}
async function loadPodHistoricalAnalysis(namespace, podName, timeRange = '24h') {
showLoading();
currentNamespace = namespace;
currentWorkload = podName; // For backward compatibility
try { try {
const timeRange = '24h'; // Default time range
const response = await fetch(`/api/v1/namespace/${namespace}/pod/${podName}/historical-analysis?time_range=${timeRange}`); const response = await fetch(`/api/v1/namespace/${namespace}/pod/${podName}/historical-analysis?time_range=${timeRange}`);
if (!response.ok) { if (!response.ok) {
@@ -1562,6 +1624,88 @@
} }
} }
/**
 * Render a workload historical-analysis response into the
 * `#historicalAnalysisContent` element.
 *
 * @param {Object} data - Response payload; `data.analysis` carries either
 *   an `error` string or the usage/requests/limits/utilization values and
 *   a `recommendations` array.
 */
function displayWorkloadHistoricalAnalysis(data) {
    const container = document.getElementById('historicalAnalysisContent');
    if (!container) {
        // Nothing to render into (e.g. the modal body was replaced by other content).
        console.error('historicalAnalysisContent element not found');
        return;
    }

    // All text below ends up in innerHTML, so server-provided strings
    // (workload names, error messages, recommendation text) must be escaped.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) =>
        (value == null ? '' : String(value)).replace(/[&<>"']/g, ch => htmlEntities[ch]);

    // Missing or non-numeric fields are rendered as 0 instead of throwing on .toFixed().
    const num = (value) => {
        const parsed = Number(value);
        return Number.isFinite(parsed) ? parsed : 0;
    };
    const cores = (value) => num(value).toFixed(3);
    const gib = (bytes) => (num(bytes) / (1024 * 1024 * 1024)).toFixed(2);
    const utilizationColor = (pct) => pct > 80 ? '#dc3545' : pct < 20 ? '#28a745' : '#007bff';

    const analysis = (data && data.analysis) || {};

    if (analysis.error) {
        container.innerHTML = `
            <div class="error">
                <h3>Error loading historical data</h3>
                <p>${escapeHtml(analysis.error)}</p>
            </div>
        `;
        return;
    }

    const recommendations = analysis.recommendations || [];
    const cpuUtilization = num(analysis.cpu_utilization);
    const memoryUtilization = num(analysis.memory_utilization);

    let html = `
        <div class="historical-summary">
            <h3>Workload: ${escapeHtml(analysis.workload)}</h3>
            <p><strong>Namespace:</strong> ${escapeHtml(analysis.namespace)}</p>
            <p><strong>Time Range:</strong> ${escapeHtml(analysis.time_range)}</p>
        </div>

        <div class="historical-stats">
            <div class="historical-stat">
                <h4>CPU Usage</h4>
                <div class="value">${cores(analysis.cpu_usage)} cores</div>
            </div>
            <div class="historical-stat">
                <h4>Memory Usage</h4>
                <div class="value">${gib(analysis.memory_usage)} GiB</div>
            </div>
            <div class="historical-stat">
                <h4>CPU Utilization</h4>
                <div class="value" style="color: ${utilizationColor(cpuUtilization)}">${cpuUtilization.toFixed(1)}%</div>
            </div>
            <div class="historical-stat">
                <h4>Memory Utilization</h4>
                <div class="value" style="color: ${utilizationColor(memoryUtilization)}">${memoryUtilization.toFixed(1)}%</div>
            </div>
            <div class="historical-stat">
                <h4>CPU Requests</h4>
                <div class="value">${cores(analysis.cpu_requests)} cores</div>
            </div>
            <div class="historical-stat">
                <h4>Memory Requests</h4>
                <div class="value">${gib(analysis.memory_requests)} GiB</div>
            </div>
            <div class="historical-stat">
                <h4>CPU Limits</h4>
                <div class="value">${cores(analysis.cpu_limits)} cores</div>
            </div>
            <div class="historical-stat">
                <h4>Memory Limits</h4>
                <div class="value">${gib(analysis.memory_limits)} GiB</div>
            </div>
        </div>
    `;

    if (recommendations.length > 0) {
        html += `
            <div class="historical-summary">
                <h3>Recommendations</h3>
        `;

        recommendations.forEach(rec => {
            // The CSS class is picked from a fixed set, so only the visible text needs escaping.
            const severityClass = rec.severity === 'error' ? 'error' : rec.severity === 'warning' ? 'warning' : 'info';
            html += `
                <div class="recommendation ${severityClass}">
                    <span class="badge ${severityClass}">${escapeHtml(rec.severity)}</span>
                    <strong>${escapeHtml(rec.message)}</strong>
                    <p>${escapeHtml(rec.recommendation)}</p>
                </div>
            `;
        });

        html += `</div>`;
    }

    container.innerHTML = html;
}
function displayPodHistoricalAnalysis(data) { function displayPodHistoricalAnalysis(data) {
const container = document.getElementById('historicalModalBody'); const container = document.getElementById('historicalModalBody');
@@ -1719,11 +1863,15 @@
function showHistoricalModal(namespace) { function showHistoricalModal(namespace) {
document.getElementById('historicalModalTitle').textContent = `Historical Analysis - ${namespace}`; document.getElementById('historicalModalTitle').textContent = `Historical Analysis - ${namespace}`;
document.getElementById('historicalModal').classList.add('show'); const modal = document.getElementById('historicalModal');
modal.classList.remove('hidden');
modal.classList.add('show');
} }
function closeHistoricalModal() { function closeHistoricalModal() {
document.getElementById('historicalModal').classList.remove('show'); const modal = document.getElementById('historicalModal');
modal.classList.remove('show');
modal.classList.add('hidden');
} }
// Close modal when clicking outside // Close modal when clicking outside
@@ -1733,6 +1881,24 @@
} }
}); });
// Delegated click handling for the time-range buttons: mark the clicked
// button as active and reload the currently shown workload for that range.
document.addEventListener('click', (event) => {
    const clicked = event.target;
    if (!clicked.classList.contains('time-range-btn')) {
        return;
    }

    const selectedRange = clicked.getAttribute('data-range');

    // Only one button may carry the highlight at a time.
    for (const button of document.querySelectorAll('.time-range-btn')) {
        button.classList.remove('active');
    }
    clicked.classList.add('active');

    // Nothing to reload until a workload has been opened at least once.
    if (!currentNamespace || !currentWorkload) {
        return;
    }
    loadWorkloadHistoricalAnalysis(currentNamespace, currentWorkload, selectedRange);
});
// Export Modal Functions // Export Modal Functions
function showExportModal() { function showExportModal() {
document.getElementById('exportModal').classList.add('show'); document.getElementById('exportModal').classList.add('show');