Fix: Historical analysis now shows real consumption numbers and percentages relative to cluster totals

2025-09-30 18:03:17 -03:00
parent 5c5643576f
commit 2b2b3c23b2
2 changed files with 264 additions and 150 deletions
--- a/app/api/routes.py
+++ b/app/api/routes.py
@@ -495,53 +495,108 @@ async def get_workload_historical_metrics(
    workload: str,
    time_range: str = "24h"
 ):
-    """Get historical metrics for a specific workload (deployment/daemonset)"""
+    """Get historical metrics for a specific workload with cluster percentages"""
    try:
        prometheus_client = PrometheusClient()
-        # Get CPU and Memory usage metrics for the workload
+        # Get current usage (latest values)
-        cpu_usage = await prometheus_client.query_range(
+        cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{workload}-.*"}}[5m])'
-            f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}",pod=~"{workload}-.*"}}[5m])',
+        memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}",pod=~"{workload}-.*"}}'
            time_range
        )
-        memory_usage = await prometheus_client.query_range(
+        cpu_usage_data = await prometheus_client.query(cpu_usage_query)
-            f'container_memory_working_set_bytes{{namespace="{namespace}",pod=~"{workload}-.*"}}',
+        memory_usage_data = await prometheus_client.query(memory_usage_query)
            time_range
        )
        # Get resource requests and limits
-        cpu_requests = await prometheus_client.query_range(
+        cpu_requests_query = f'kube_pod_container_resource_requests{{namespace="{namespace}",pod=~"{workload}-.*",resource="cpu"}}'
-            f'kube_pod_container_resource_requests{{namespace="{namespace}",pod=~"{workload}-.*",resource="cpu"}}',
+        memory_requests_query = f'kube_pod_container_resource_requests{{namespace="{namespace}",pod=~"{workload}-.*",resource="memory"}}'
            time_range
        )
-        memory_requests = await prometheus_client.query_range(
+        cpu_requests_data = await prometheus_client.query(cpu_requests_query)
-            f'kube_pod_container_resource_requests{{namespace="{namespace}",pod=~"{workload}-.*",resource="memory"}}',
+        memory_requests_data = await prometheus_client.query(memory_requests_query)
            time_range
        )
-        cpu_limits = await prometheus_client.query_range(
+        cpu_limits_query = f'kube_pod_container_resource_limits{{namespace="{namespace}",pod=~"{workload}-.*",resource="cpu"}}'
-            f'kube_pod_container_resource_limits{{namespace="{namespace}",pod=~"{workload}-.*",resource="cpu"}}',
+        memory_limits_query = f'kube_pod_container_resource_limits{{namespace="{namespace}",pod=~"{workload}-.*",resource="memory"}}'
            time_range
        )
-        memory_limits = await prometheus_client.query_range(
+        cpu_limits_data = await prometheus_client.query(cpu_limits_query)
-            f'kube_pod_container_resource_limits{{namespace="{namespace}",pod=~"{workload}-.*",resource="memory"}}',
+        memory_limits_data = await prometheus_client.query(memory_limits_query)
-            time_range
+        
-        )
+        # Get cluster total resources
        cluster_cpu_query = 'sum(kube_node_status_allocatable{resource="cpu"})'
        cluster_memory_query = 'sum(kube_node_status_allocatable{resource="memory"})'
        cluster_cpu_data = await prometheus_client.query(cluster_cpu_query)
        cluster_memory_data = await prometheus_client.query(cluster_memory_query)
        # Extract values
        cpu_usage = 0
        memory_usage = 0
        cpu_requests = 0
        memory_requests = 0
        cpu_limits = 0
        memory_limits = 0
        cluster_cpu_total = 0
        cluster_memory_total = 0
        if cpu_usage_data.get("status") == "success" and cpu_usage_data.get("data", {}).get("result"):
            cpu_usage = float(cpu_usage_data["data"]["result"][0]["value"][1])
        if memory_usage_data.get("status") == "success" and memory_usage_data.get("data", {}).get("result"):
            memory_usage = float(memory_usage_data["data"]["result"][0]["value"][1])
        if cpu_requests_data.get("status") == "success" and cpu_requests_data.get("data", {}).get("result"):
            cpu_requests = float(cpu_requests_data["data"]["result"][0]["value"][1])
        if memory_requests_data.get("status") == "success" and memory_requests_data.get("data", {}).get("result"):
            memory_requests = float(memory_requests_data["data"]["result"][0]["value"][1])
        if cpu_limits_data.get("status") == "success" and cpu_limits_data.get("data", {}).get("result"):
            cpu_limits = float(cpu_limits_data["data"]["result"][0]["value"][1])
        if memory_limits_data.get("status") == "success" and memory_limits_data.get("data", {}).get("result"):
            memory_limits = float(memory_limits_data["data"]["result"][0]["value"][1])
        if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"):
            cluster_cpu_total = float(cluster_cpu_data["data"]["result"][0]["value"][1])
        if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"):
            cluster_memory_total = float(cluster_memory_data["data"]["result"][0]["value"][1])
        # Calculate percentages
        cpu_usage_percent = (cpu_usage / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
        memory_usage_percent = (memory_usage / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
        cpu_requests_percent = (cpu_requests / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
        memory_requests_percent = (memory_requests / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
        cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
        memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
        return {
            "workload": workload,
            "namespace": namespace,
            "time_range": time_range,
-            "metrics": {
+            "cluster_total": {
-                "cpu_usage": cpu_usage,
+                "cpu_cores": cluster_cpu_total,
-                "memory_usage": memory_usage,
+                "memory_bytes": cluster_memory_total,
-                "cpu_requests": cpu_requests,
+                "memory_gb": cluster_memory_total / (1024**3)
-                "memory_requests": memory_requests,
+            },
-                "cpu_limits": cpu_limits,
+            "workload_metrics": {
-                "memory_limits": memory_limits
+                "cpu": {
                    "usage_cores": cpu_usage,
                    "usage_percent": round(cpu_usage_percent, 2),
                    "requests_cores": cpu_requests,
                    "requests_percent": round(cpu_requests_percent, 2),
                    "limits_cores": cpu_limits,
                    "limits_percent": round(cpu_limits_percent, 2)
                },
                "memory": {
                    "usage_bytes": memory_usage,
                    "usage_mb": round(memory_usage / (1024**2), 2),
                    "usage_percent": round(memory_usage_percent, 2),
                    "requests_bytes": memory_requests,
                    "requests_mb": round(memory_requests / (1024**2), 2),
                    "requests_percent": round(memory_requests_percent, 2),
                    "limits_bytes": memory_limits,
                    "limits_mb": round(memory_limits / (1024**2), 2),
                    "limits_percent": round(memory_limits_percent, 2)
                }
            }
        }
    except Exception as e:
--- a/app/static/index.html
+++ b/app/static/index.html
@@ -411,6 +411,95 @@
            border: 1px solid #eee;
            border-radius: 4px;
        }
        /* Row of cluster-wide summary cards (wraps the .stat-card elements). */
        .cluster-stats {
            display: flex;
            gap: 20px;
            margin-bottom: 30px;
        }
        /* One summary card; flex: 1 makes all cards in the row share the width equally. */
        .stat-card {
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 20px;
            text-align: center;
            flex: 1;
        }
        /* Small upper-case caption at the top of a card. */
        .stat-card h4 {
            margin: 0 0 10px 0;
            color: #495057;
            font-size: 14px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }
        /* The large highlighted figure inside a card. */
        .stat-value {
            font-size: 24px;
            font-weight: bold;
            color: #007bff;
        }
        /* Two equal-width columns holding the .metric-section panels. */
        .metrics-grid {
            display: grid;
            grid-template-columns: 1fr 1fr;
            gap: 30px;
        }
        /* Bordered panel grouping the rows of one resource type. */
        .metric-section {
            background: white;
            border: 1px solid #dee2e6;
            border-radius: 8px;
            padding: 20px;
        }
        /* Panel heading, underlined to separate it from the rows below. */
        .metric-section h4 {
            margin: 0 0 20px 0;
            color: #495057;
            border-bottom: 2px solid #e9ecef;
            padding-bottom: 10px;
        }
        /* One label / value / percentage line; items are pushed apart horizontally. */
        .metric-row {
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 8px 0;
            border-bottom: 1px solid #f8f9fa;
        }
        /* The last row of a panel has no separator line. */
        .metric-row:last-child {
            border-bottom: none;
        }
        /* Row label; min-width keeps the labels of different rows aligned. */
        .metric-label {
            font-weight: 500;
            color: #6c757d;
            min-width: 120px;
        }
        /* Numeric value, rendered in a monospace font. */
        .metric-value {
            font-weight: bold;
            color: #212529;
            font-family: 'Courier New', monospace;
        }
        /* Secondary, de-emphasised annotation shown next to a value. */
        .metric-percent {
            color: #6c757d;
            font-size: 12px;
            font-style: italic;
        }
        /* Viewports up to 768px wide: collapse the grid to one column and stack the cards vertically. */
        @media (max-width: 768px) {
            .metrics-grid {
                grid-template-columns: 1fr;
            }
            .cluster-stats {
                flex-direction: column;
            }
        }
        /* Problem Summary Table */
        .problem-summary {
@@ -1369,9 +1458,9 @@
                modal.id = 'historicalModal';
                modal.className = 'modal';
                modal.innerHTML = `
-                    <div class="modal-content" style="width: 90%; max-width: 1200px;">
+                    <div class="modal-content" style="width: 90%; max-width: 1000px;">
                        <div class="modal-header">
-                            <h2>📈 Historical Analysis - Real Prometheus Metrics</h2>
+                            <h2>📊 Resource Consumption Analysis - Real Numbers</h2>
                            <span class="close">&times;</span>
                        </div>
                        <div class="modal-body" id="historicalModalBody">
@@ -1387,14 +1476,14 @@
                                    <option value="7d">Last 7 days</option>
                                </select>
                            </div>
-                            <div id="metricsCharts" style="display: none;">
+                            <div id="metricsData" style="display: none;">
-                                <div class="chart-container">
+                                <div class="cluster-info">
-                                    <h3>CPU Usage vs Requests/Limits</h3>
+                                    <h3>🏢 Cluster Total Resources</h3>
-                                    <canvas id="cpuChart" width="800" height="300"></canvas>
+                                    <div id="clusterTotal"></div>
                                </div>
-                                <div class="chart-container">
+                                <div class="workload-metrics">
-                                    <h3>Memory Usage vs Requests/Limits</h3>
+                                    <h3>📈 Workload Resource Consumption</h3>
-                                    <canvas id="memoryChart" width="800" height="300"></canvas>
+                                    <div id="workloadData"></div>
                                </div>
                            </div>
                        </div>
@@ -1435,10 +1524,10 @@
        async function loadWorkloadMetrics() {
            const workloadSelect = document.getElementById('workloadSelect');
            const timeRangeSelect = document.getElementById('timeRangeSelect');
-            const chartsDiv = document.getElementById('metricsCharts');
+            const metricsDiv = document.getElementById('metricsData');
            if (!workloadSelect.value) {
-                chartsDiv.style.display = 'none';
+                metricsDiv.style.display = 'none';
                return;
            }
@@ -1446,128 +1535,98 @@
            const timeRange = timeRangeSelect.value;
            try {
-                chartsDiv.style.display = 'block';
+                metricsDiv.style.display = 'block';
-                chartsDiv.innerHTML = '<p>Loading metrics from Prometheus...</p>';
+                metricsDiv.innerHTML = '<p>Loading metrics from Prometheus...</p>';
                const response = await fetch(`/api/v1/workloads/${namespace}/${workload}/metrics?time_range=${timeRange}`);
                const data = await response.json();
-                if (data.metrics) {
+                if (data.workload_metrics) {
-                    renderMetricsCharts(data.metrics, timeRange);
+                    renderMetricsData(data);
                } else {
-                    chartsDiv.innerHTML = '<p>No metrics data available for this workload.</p>';
+                    metricsDiv.innerHTML = '<p>No metrics data available for this workload.</p>';
                }
            } catch (error) {
                console.error('Error loading metrics:', error);
-                chartsDiv.innerHTML = '<p>Error loading metrics. Please try again.</p>';
+                metricsDiv.innerHTML = '<p>Error loading metrics. Please try again.</p>';
            }
        }
-        function renderMetricsCharts(metrics, timeRange) {
+        /**
+         * Render the cluster totals and the selected workload's CPU/memory
+         * figures into the historical-analysis modal.
+         *
+         * Reads data.cluster_total (cpu_cores, memory_gb) and
+         * data.workload_metrics.cpu / .memory from the metrics API response.
+         * Does nothing when the #metricsData container is not in the document.
+         *
+         * @param {Object} data - parsed JSON body of the workload metrics endpoint.
+         */
+        function renderMetricsData(data) {
-            const chartsDiv = document.getElementById('metricsCharts');
+            let clusterTotalDiv = document.getElementById('clusterTotal');
-            chartsDiv.innerHTML = `
+            let workloadDataDiv = document.getElementById('workloadData');
+            // loadWorkloadMetrics() replaces the content of #metricsData with a
+            // loading message, which removes #clusterTotal and #workloadData.
+            // Rebuild both containers when they are missing instead of throwing.
+            if (!clusterTotalDiv || !workloadDataDiv) {
+                const metricsDiv = document.getElementById('metricsData');
+                if (!metricsDiv) {
+                    return;
+                }
+                metricsDiv.innerHTML = `
+                    <div class="cluster-info">
+                        <h3>🏢 Cluster Total Resources</h3>
+                        <div id="clusterTotal"></div>
+                    </div>
+                    <div class="workload-metrics">
+                        <h3>📈 Workload Resource Consumption</h3>
+                        <div id="workloadData"></div>
+                    </div>
+                `;
+                clusterTotalDiv = document.getElementById('clusterTotal');
+                workloadDataDiv = document.getElementById('workloadData');
+            }
-                <div class="chart-container">
+            
-                    <h3>CPU Usage vs Requests/Limits (${timeRange})</h3>
+            // Render cluster total resources
-                    <canvas id="cpuChart" width="800" height="300"></canvas>
+            clusterTotalDiv.innerHTML = `
-                </div>
+                <div class="cluster-stats">
-                <div class="chart-container">
+                    <div class="stat-card">
-                    <h3>Memory Usage vs Requests/Limits (${timeRange})</h3>
+                        <h4>CPU Total</h4>
-                    <canvas id="memoryChart" width="800" height="300"></canvas>
+                        <div class="stat-value">${data.cluster_total.cpu_cores.toFixed(2)} cores</div>
                    </div>
                    <div class="stat-card">
                        <h4>Memory Total</h4>
                        <div class="stat-value">${data.cluster_total.memory_gb.toFixed(2)} GB</div>
                    </div>
                </div>
            `;
-            // Simple chart rendering (you can replace with Chart.js or similar)
+            // Render workload metrics
-            renderSimpleChart('cpuChart', metrics.cpu_usage, metrics.cpu_requests, metrics.cpu_limits, 'CPU (cores)');
+            const cpu = data.workload_metrics.cpu;
-            renderSimpleChart('memoryChart', metrics.memory_usage, metrics.memory_requests, metrics.memory_limits, 'Memory (bytes)');
+            const memory = data.workload_metrics.memory;
            // Efficiency is usage / requests. It is shown as N/A when either side is
            // zero, so a workload without requests never renders "Infinity%".
            workloadDataDiv.innerHTML = `
                <div class="metrics-grid">
                    <div class="metric-section">
                        <h4>🖥️ CPU Resources</h4>
                        <div class="metric-row">
                            <span class="metric-label">Current Usage:</span>
                            <span class="metric-value">${cpu.usage_cores.toFixed(3)} cores</span>
                            <span class="metric-percent">(${cpu.usage_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Requests:</span>
                            <span class="metric-value">${cpu.requests_cores.toFixed(3)} cores</span>
                            <span class="metric-percent">(${cpu.requests_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Limits:</span>
                            <span class="metric-value">${cpu.limits_cores.toFixed(3)} cores</span>
                            <span class="metric-percent">(${cpu.limits_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Efficiency:</span>
                            <span class="metric-value">${cpu.usage_cores > 0 && cpu.requests_cores > 0 ? (cpu.usage_cores / cpu.requests_cores * 100).toFixed(1) + '%' : 'N/A'}</span>
                            <span class="metric-percent">(usage vs requests)</span>
                        </div>
                    </div>
                    <div class="metric-section">
                        <h4>💾 Memory Resources</h4>
                        <div class="metric-row">
                            <span class="metric-label">Current Usage:</span>
                            <span class="metric-value">${memory.usage_mb.toFixed(2)} MB</span>
                            <span class="metric-percent">(${memory.usage_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Requests:</span>
                            <span class="metric-value">${memory.requests_mb.toFixed(2)} MB</span>
                            <span class="metric-percent">(${memory.requests_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Limits:</span>
                            <span class="metric-value">${memory.limits_mb.toFixed(2)} MB</span>
                            <span class="metric-percent">(${memory.limits_percent}% of cluster)</span>
                        </div>
                        <div class="metric-row">
                            <span class="metric-label">Efficiency:</span>
                            <span class="metric-value">${memory.usage_bytes > 0 && memory.requests_bytes > 0 ? (memory.usage_bytes / memory.requests_bytes * 100).toFixed(1) + '%' : 'N/A'}</span>
                            <span class="metric-percent">(usage vs requests)</span>
                        </div>
                    </div>
                </div>
            `;
        }
        /**
         * Draw usage, requests and limits as line plots on one canvas.
         *
         * Each series is an array of points whose value is read from index 1
         * (e.g. [timestamp, value]); points are spread evenly along the x axis
         * in array order. All series share a single y scale derived from the
         * largest finite value across them, so the three lines are directly
         * comparable. Empty or missing series are skipped, and nothing is
         * drawn when the canvas element does not exist.
         *
         * @param {string} canvasId - id of the <canvas> element to draw on.
         * @param {Array} usage - usage points, drawn as a solid blue line.
         * @param {Array} requests - request points, drawn as a dashed green line.
         * @param {Array} limits - limit points, drawn as a dashed red line.
         * @param {string} unit - label printed next to the y axis.
         */
        function renderSimpleChart(canvasId, usage, requests, limits, unit) {
            const canvas = document.getElementById(canvasId);
            if (!canvas) {
                // The chart canvases are optional in the page; nothing to draw on.
                return;
            }
            const ctx = canvas.getContext('2d');
            const width = canvas.width;
            const height = canvas.height;
            const margin = 50;
            const plotWidth = width - 2 * margin;
            const plotHeight = height - 2 * margin;

            // Clear canvas
            ctx.clearRect(0, 0, width, height);

            // Draw axes
            ctx.strokeStyle = '#333';
            ctx.lineWidth = 2;
            ctx.setLineDash([]);
            ctx.beginPath();
            ctx.moveTo(margin, height - margin);
            ctx.lineTo(width - margin, height - margin);
            ctx.moveTo(margin, margin);
            ctx.lineTo(margin, height - margin);
            ctx.stroke();

            const seriesList = [
                { points: usage, color: '#007bff', lineWidth: 2, dash: [] },
                { points: requests, color: '#28a745', lineWidth: 1, dash: [5, 5] },
                { points: limits, color: '#dc3545', lineWidth: 1, dash: [5, 5] }
            ].filter(series => series.points && series.points.length > 0);

            // One maximum for every series, computed once. A plain loop is used
            // instead of Math.max(...values) so long series cannot exceed the
            // engine's argument-count limit.
            let maxValue = 0;
            seriesList.forEach(series => {
                series.points.forEach(point => {
                    const value = Number(point[1]);
                    if (Number.isFinite(value) && value > maxValue) {
                        maxValue = value;
                    }
                });
            });
            // With an all-zero data set every point sits on the baseline
            // rather than producing NaN coordinates from a 0 / 0 division.
            const yScale = maxValue > 0 ? plotHeight / maxValue : 0;

            seriesList.forEach(series => {
                ctx.strokeStyle = series.color;
                ctx.lineWidth = series.lineWidth;
                ctx.setLineDash(series.dash);
                ctx.beginPath();
                let started = false;
                series.points.forEach((point, index) => {
                    const value = Number(point[1]);
                    if (!Number.isFinite(value)) {
                        // Skip samples that are not numeric.
                        return;
                    }
                    const x = margin + (index * plotWidth / series.points.length);
                    const y = height - margin - value * yScale;
                    if (started) {
                        ctx.lineTo(x, y);
                    } else {
                        ctx.moveTo(x, y);
                        started = true;
                    }
                });
                ctx.stroke();
            });

            // Reset line dash
            ctx.setLineDash([]);

            // Add labels
            ctx.fillStyle = '#333';
            ctx.font = '12px Arial';
            ctx.fillText(unit, 10, height / 2);
            ctx.fillText('Time', width / 2, height - 10);
        }
        function exportComplianceReport() {
            alert('Exporting compliance report...');