- Add CustomObjectsApi integration for VPA resources
- Implement VPA CRUD operations (list, create, delete)
- Add VPA recommendation collection via CRD
- Add API endpoints for VPA management
- Handle VPA installation detection gracefully
- Complete TODO #1: CRD for VPA implementation
"""
|
|
API Routes
|
|
"""
|
|
import logging
|
|
from typing import List, Optional
|
|
from datetime import datetime
|
|
from fastapi import APIRouter, HTTPException, Depends, Request
|
|
from fastapi.responses import FileResponse
|
|
|
|
from app.models.resource_models import (
|
|
ClusterReport, NamespaceReport, ExportRequest,
|
|
ApplyRecommendationRequest, WorkloadCategory, SmartRecommendation,
|
|
PodHealthScore, SimplifiedValidation
|
|
)
|
|
from app.services.validation_service import ValidationService
|
|
from app.services.report_service import ReportService
|
|
from app.services.historical_analysis import HistoricalAnalysisService
|
|
from app.services.smart_recommendations import SmartRecommendationsService
|
|
from app.core.prometheus_client import PrometheusClient
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Create router
|
|
api_router = APIRouter()
|
|
|
|
# Initialize services
|
|
validation_service = ValidationService()
|
|
report_service = ReportService()
|
|
smart_recommendations_service = SmartRecommendationsService()
|
|
|
|
def get_k8s_client(request: Request):
|
|
"""Dependency to get Kubernetes client"""
|
|
return request.app.state.k8s_client
|
|
|
|
def get_prometheus_client(request: Request):
|
|
"""Dependency to get Prometheus client"""
|
|
return request.app.state.prometheus_client
|
|
|
|
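
# NOTE: both dependencies above assume the application factory stores initialized
# clients on app.state (app.state.k8s_client / app.state.prometheus_client) during
# startup; the exact wiring lives outside this module.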

def _extract_workload_name(pod_name: str) -> str:
    """Extract workload name from pod name (remove replica set suffix)"""
    # Pod names typically follow pattern: workload-name-hash-suffix
    # e.g., resource-governance-798b5579d6-7h298 -> resource-governance
    parts = pod_name.split('-')
    if len(parts) >= 3 and parts[-1].isalnum() and len(parts[-1]) == 5:
        # Remove the last two parts (hash and suffix)
        return '-'.join(parts[:-2])
    elif len(parts) >= 2 and parts[-1].isalnum() and len(parts[-1]) == 5:
        # Remove the last part (suffix)
        return '-'.join(parts[:-1])
    return pod_name
@api_router.get("/cluster/status")
|
|
async def get_cluster_status(
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get overall cluster status"""
|
|
try:
|
|
# Collect basic data
|
|
pods = await k8s_client.get_all_pods()
|
|
nodes_info = await k8s_client.get_nodes_info()
|
|
|
|
# Validate resources with historical analysis by workload (more reliable)
|
|
all_validations = []
|
|
|
|
# Group pods by namespace for workload analysis
|
|
namespace_pods = {}
|
|
for pod in pods:
|
|
if pod.namespace not in namespace_pods:
|
|
namespace_pods[pod.namespace] = []
|
|
namespace_pods[pod.namespace].append(pod)
|
|
|
|
# Analyze each namespace's workloads
|
|
for namespace, namespace_pod_list in namespace_pods.items():
|
|
try:
|
|
# Use workload-based analysis (more reliable than individual pods)
|
|
workload_validations = await validation_service.validate_workload_resources_with_historical_analysis(
|
|
namespace_pod_list, "24h"
|
|
)
|
|
all_validations.extend(workload_validations)
|
|
except Exception as e:
|
|
logger.warning(f"Error in workload analysis for namespace {namespace}: {e}")
|
|
# Fallback to individual pod analysis
|
|
for pod in namespace_pod_list:
|
|
try:
|
|
pod_validations = await validation_service.validate_pod_resources_with_historical_analysis(pod, "24h")
|
|
all_validations.extend(pod_validations)
|
|
except Exception as pod_e:
|
|
logger.warning(f"Error in historical analysis for pod {pod.name}: {pod_e}")
|
|
# Final fallback to static validations only
|
|
try:
|
|
static_validations = validation_service.validate_pod_resources(pod)
|
|
all_validations.extend(static_validations)
|
|
except Exception as static_e:
|
|
logger.error(f"Error in static validation for pod {pod.name}: {static_e}")
|
|
|
|
# Get overcommit information
|
|
overcommit_info = await prometheus_client.get_cluster_overcommit()
|
|
|
|
# Get VPA recommendations
|
|
vpa_recommendations = await k8s_client.get_vpa_recommendations()
|
|
|
|
# Group pods by namespace for the frontend
|
|
namespaces_data = {}
|
|
pod_validations_map = {}
|
|
|
|
# Create a map of pod validations (static + historical)
|
|
for validation in all_validations:
|
|
pod_key = f"{validation.namespace}/{validation.pod_name}"
|
|
if pod_key not in pod_validations_map:
|
|
pod_validations_map[pod_key] = []
|
|
pod_validations_map[pod_key].append(validation)
|
|
|
|
for pod in pods:
|
|
namespace = pod.namespace
|
|
if namespace not in namespaces_data:
|
|
namespaces_data[namespace] = {
|
|
'namespace': namespace,
|
|
'pods': {},
|
|
'total_validations': 0,
|
|
'severity_breakdown': {'error': 0, 'warning': 0, 'info': 0}
|
|
}
|
|
|
|
# Add pod to namespace
|
|
pod_name = pod.name
|
|
pod_key = f"{namespace}/{pod_name}"
|
|
pod_validations = pod_validations_map.get(pod_key, [])
|
|
|
|
# Convert pod to the format expected by frontend
|
|
pod_data = {
|
|
'pod_name': pod_name,
|
|
'namespace': namespace,
|
|
'phase': pod.phase,
|
|
'node_name': pod.node_name,
|
|
'containers': [],
|
|
'validations': []
|
|
}
|
|
|
|
# Add containers
|
|
for container in pod.containers:
|
|
container_data = {
|
|
'name': container['name'],
|
|
'image': container['image'],
|
|
'resources': container['resources']
|
|
}
|
|
pod_data['containers'].append(container_data)
|
|
|
|
# Add validations for this pod
|
|
for validation in pod_validations:
|
|
validation_data = {
|
|
'rule_name': validation.validation_type,
|
|
'namespace': namespace,
|
|
'message': validation.message,
|
|
'recommendation': validation.recommendation,
|
|
'severity': validation.severity
|
|
}
|
|
pod_data['validations'].append(validation_data)
|
|
|
|
# Update namespace severity breakdown
|
|
namespaces_data[namespace]['severity_breakdown'][validation.severity] += 1
|
|
namespaces_data[namespace]['total_validations'] += 1
|
|
|
|
namespaces_data[namespace]['pods'][pod_name] = pod_data
|
|
|
|
# Convert to list format expected by frontend
|
|
namespaces_list = list(namespaces_data.values())
|
|
|
|
# Count total errors and warnings
|
|
total_errors = sum(ns['severity_breakdown']['error'] for ns in namespaces_list)
|
|
total_warnings = sum(ns['severity_breakdown']['warning'] for ns in namespaces_list)
|
|
|
|
# Process overcommit information
|
|
cpu_overcommit_percent = 0
|
|
memory_overcommit_percent = 0
|
|
namespaces_in_overcommit = 0
|
|
resource_quota_coverage = 0
|
|
|
|
if overcommit_info and overcommit_info.get("cpu") and overcommit_info.get("memory"):
|
|
cpu_capacity = 0
|
|
cpu_requests = 0
|
|
memory_capacity = 0
|
|
memory_requests = 0
|
|
|
|
# Extract CPU data
|
|
if overcommit_info["cpu"].get("capacity", {}).get("status") == "success":
|
|
for result in overcommit_info["cpu"]["capacity"].get("data", {}).get("result", []):
|
|
cpu_capacity += float(result["value"][1])
|
|
|
|
if overcommit_info["cpu"].get("requests", {}).get("status") == "success":
|
|
for result in overcommit_info["cpu"]["requests"].get("data", {}).get("result", []):
|
|
cpu_requests += float(result["value"][1])
|
|
|
|
# Extract Memory data
|
|
if overcommit_info["memory"].get("capacity", {}).get("status") == "success":
|
|
for result in overcommit_info["memory"]["capacity"].get("data", {}).get("result", []):
|
|
memory_capacity += float(result["value"][1])
|
|
|
|
if overcommit_info["memory"].get("requests", {}).get("status") == "success":
|
|
for result in overcommit_info["memory"]["requests"].get("data", {}).get("result", []):
|
|
memory_requests += float(result["value"][1])
|
|
|
|
# Calculate overcommit percentages
|
|
if cpu_capacity > 0:
|
|
cpu_overcommit_percent = round((cpu_requests / cpu_capacity) * 100, 1)
|
|
|
|
if memory_capacity > 0:
|
|
memory_overcommit_percent = round((memory_requests / memory_capacity) * 100, 1)
|
|
|
|
# Debug logging
|
|
logger.info(f"Overcommit Debug - CPU Capacity: {cpu_capacity}, CPU Requests: {cpu_requests}, CPU Overcommit: {cpu_overcommit_percent}%")
|
|
logger.info(f"Overcommit Debug - Memory Capacity: {memory_capacity}, Memory Requests: {memory_requests}, Memory Overcommit: {memory_overcommit_percent}%")
|
|
|
|
# Count namespaces in overcommit (simplified - any namespace with requests > 0)
|
|
namespaces_in_overcommit = len([ns for ns in namespaces_list if ns['total_validations'] > 0])
|
|
|
|
# Calculate resource utilization (usage vs requests) - simplified
|
|
# This would ideally use actual usage data from Prometheus
|
|
resource_utilization = 0
|
|
if cpu_requests > 0 and memory_requests > 0:
|
|
# For now, we'll use a simplified calculation
|
|
# In a real implementation, this would compare actual usage vs requests
|
|
resource_utilization = 75 # Placeholder - would be calculated from real usage data
|
|
|
|
return {
|
|
"timestamp": datetime.now().isoformat(),
|
|
"total_pods": len(pods),
|
|
"total_namespaces": len(namespaces_list),
|
|
"total_nodes": len(nodes_info) if nodes_info else 0,
|
|
"total_errors": total_errors,
|
|
"total_warnings": total_warnings,
|
|
"namespaces": namespaces_list,
|
|
"overcommit": {
|
|
"cpu_overcommit_percent": cpu_overcommit_percent,
|
|
"memory_overcommit_percent": memory_overcommit_percent,
|
|
"namespaces_in_overcommit": namespaces_in_overcommit,
|
|
"resource_utilization": resource_utilization,
|
|
"cpu_capacity": cpu_capacity if 'cpu_capacity' in locals() else 0,
|
|
"cpu_requests": cpu_requests if 'cpu_requests' in locals() else 0,
|
|
"memory_capacity": memory_capacity if 'memory_capacity' in locals() else 0,
|
|
"memory_requests": memory_requests if 'memory_requests' in locals() else 0
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting cluster status: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
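
# Example (illustrative numbers): with cpu_requests = 12 cores and cpu_capacity = 8
# cores, cpu_overcommit_percent = round(12 / 8 * 100, 1) = 150.0, i.e. requested CPU
# is 150% of what the nodes can actually allocate.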
@api_router.get("/namespace/{namespace}/status")
|
|
async def get_namespace_status(
|
|
namespace: str,
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get status of a specific namespace"""
|
|
try:
|
|
# Collect namespace data
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
|
|
# Validate resources
|
|
all_validations = []
|
|
for pod in namespace_resources.pods:
|
|
pod_validations = validation_service.validate_pod_resources(pod)
|
|
all_validations.extend(pod_validations)
|
|
|
|
# Get resource usage from Prometheus
|
|
resource_usage = await prometheus_client.get_namespace_resource_usage(namespace)
|
|
|
|
# Generate namespace report
|
|
report = report_service.generate_namespace_report(
|
|
namespace=namespace,
|
|
pods=namespace_resources.pods,
|
|
validations=all_validations,
|
|
resource_usage=resource_usage
|
|
)
|
|
|
|
return report
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting namespace {namespace} status: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/pods")
|
|
async def get_pods(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""List pods with resource information"""
|
|
try:
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
return namespace_resources.pods
|
|
else:
|
|
return await k8s_client.get_all_pods()
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error listing pods: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/validations")
|
|
async def get_validations(
|
|
namespace: Optional[str] = None,
|
|
severity: Optional[str] = None,
|
|
page: int = 1,
|
|
page_size: int = 50,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""List resource validations with pagination"""
|
|
try:
|
|
# Collect pods
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Validate resources
|
|
all_validations = []
|
|
for pod in pods:
|
|
pod_validations = validation_service.validate_pod_resources(pod)
|
|
all_validations.extend(pod_validations)
|
|
|
|
# Filter by severity if specified
|
|
if severity:
|
|
all_validations = [
|
|
v for v in all_validations if v.severity == severity
|
|
]
|
|
|
|
# Pagination
|
|
total = len(all_validations)
|
|
start = (page - 1) * page_size
|
|
end = start + page_size
|
|
paginated_validations = all_validations[start:end]
|
|
|
|
return {
|
|
"validations": paginated_validations,
|
|
"pagination": {
|
|
"page": page,
|
|
"page_size": page_size,
|
|
"total": total,
|
|
"total_pages": (total + page_size - 1) // page_size
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting validations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
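
# Example (illustrative): with 120 matching validations, page=3 and page_size=50
# return items 100-119, and total_pages = (120 + 50 - 1) // 50 = 3.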
@api_router.get("/validations/by-namespace")
|
|
async def get_validations_by_namespace(
|
|
severity: Optional[str] = None,
|
|
page: int = 1,
|
|
page_size: int = 20,
|
|
include_system_namespaces: bool = False,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""List validations grouped by namespace with pagination"""
|
|
try:
|
|
# Collect all pods with system namespace filter
|
|
pods = await k8s_client.get_all_pods(include_system_namespaces=include_system_namespaces)
|
|
|
|
# Validate resources and group by namespace
|
|
namespace_validations = {}
|
|
for pod in pods:
|
|
pod_validations = validation_service.validate_pod_resources(pod)
|
|
|
|
if pod.namespace not in namespace_validations:
|
|
namespace_validations[pod.namespace] = {
|
|
"namespace": pod.namespace,
|
|
"pods": {},
|
|
"total_validations": 0,
|
|
"severity_breakdown": {"error": 0, "warning": 0, "info": 0, "critical": 0}
|
|
}
|
|
|
|
# Group validations by pod
|
|
if pod.name not in namespace_validations[pod.namespace]["pods"]:
|
|
namespace_validations[pod.namespace]["pods"][pod.name] = {
|
|
"pod_name": pod.name,
|
|
"validations": []
|
|
}
|
|
|
|
# Filter by severity if specified
|
|
if severity:
|
|
pod_validations = [v for v in pod_validations if v.severity == severity]
|
|
|
|
namespace_validations[pod.namespace]["pods"][pod.name]["validations"] = pod_validations
|
|
namespace_validations[pod.namespace]["total_validations"] += len(pod_validations)
|
|
|
|
# Count severities
|
|
for validation in pod_validations:
|
|
severity = validation.severity
|
|
if severity in namespace_validations[pod.namespace]["severity_breakdown"]:
|
|
namespace_validations[pod.namespace]["severity_breakdown"][severity] += 1
|
|
else:
|
|
# Handle unknown severity types
|
|
namespace_validations[pod.namespace]["severity_breakdown"]["info"] += 1
|
|
|
|
# Convert to list and sort by total validations
|
|
namespace_list = list(namespace_validations.values())
|
|
namespace_list.sort(key=lambda x: x["total_validations"], reverse=True)
|
|
|
|
# Pagination
|
|
total = len(namespace_list)
|
|
start = (page - 1) * page_size
|
|
end = start + page_size
|
|
paginated_namespaces = namespace_list[start:end]
|
|
|
|
return {
|
|
"namespaces": paginated_namespaces,
|
|
"pagination": {
|
|
"page": page,
|
|
"page_size": page_size,
|
|
"total": total,
|
|
"total_pages": (total + page_size - 1) // page_size
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting validations by namespace: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/vpa/recommendations")
|
|
async def get_vpa_recommendations(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get VPA recommendations"""
|
|
try:
|
|
recommendations = await k8s_client.get_vpa_recommendations()
|
|
|
|
if namespace:
|
|
recommendations = [
|
|
r for r in recommendations if r.namespace == namespace
|
|
]
|
|
|
|
return recommendations
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting VPA recommendations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.post("/export")
|
|
async def export_report(
|
|
export_request: ExportRequest,
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Export report in different formats"""
|
|
try:
|
|
# Generate report
|
|
pods = await k8s_client.get_all_pods()
|
|
nodes_info = await k8s_client.get_nodes_info()
|
|
|
|
# Filter by namespaces if specified
|
|
if export_request.namespaces:
|
|
pods = [p for p in pods if p.namespace in export_request.namespaces]
|
|
|
|
# Validate resources
|
|
all_validations = []
|
|
for pod in pods:
|
|
pod_validations = validation_service.validate_pod_resources(pod)
|
|
all_validations.extend(pod_validations)
|
|
|
|
# Get additional information
|
|
overcommit_info = {}
|
|
vpa_recommendations = []
|
|
|
|
if export_request.include_vpa:
|
|
vpa_recommendations = await k8s_client.get_vpa_recommendations()
|
|
|
|
if export_request.include_validations:
|
|
overcommit_info = await prometheus_client.get_cluster_overcommit()
|
|
|
|
# Generate report
|
|
report = report_service.generate_cluster_report(
|
|
pods=pods,
|
|
validations=all_validations,
|
|
vpa_recommendations=vpa_recommendations,
|
|
overcommit_info=overcommit_info,
|
|
nodes_info=nodes_info
|
|
)
|
|
|
|
# Export
|
|
filepath = await report_service.export_report(report, export_request)
|
|
|
|
return {
|
|
"message": "Report exported successfully",
|
|
"filepath": filepath,
|
|
"format": export_request.format
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error exporting report: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/export/files")
|
|
async def list_exported_files():
|
|
"""List exported files"""
|
|
try:
|
|
files = report_service.get_exported_reports()
|
|
return files
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error listing exported files: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/export/files/{filename}")
|
|
async def download_exported_file(filename: str):
|
|
"""Download exported file"""
|
|
try:
|
|
files = report_service.get_exported_reports()
|
|
file_info = next((f for f in files if f["filename"] == filename), None)
|
|
|
|
if not file_info:
|
|
raise HTTPException(status_code=404, detail="File not found")
|
|
|
|
return FileResponse(
|
|
path=file_info["filepath"],
|
|
filename=filename,
|
|
media_type='application/octet-stream'
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error downloading file {filename}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.post("/apply/recommendation")
|
|
async def apply_recommendation(
|
|
recommendation: ApplyRecommendationRequest,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Apply resource recommendation"""
|
|
try:
|
|
logger.info(f"Applying recommendation: {recommendation.action} {recommendation.resource_type} = {recommendation.value}")
|
|
|
|
if recommendation.dry_run:
|
|
return {
|
|
"message": "Dry run - recommendation would be applied",
|
|
"pod": recommendation.pod_name,
|
|
"namespace": recommendation.namespace,
|
|
"container": recommendation.container_name,
|
|
"action": f"{recommendation.action} {recommendation.resource_type} = {recommendation.value}"
|
|
}
|
|
else:
|
|
# Apply the recommendation by patching the deployment
|
|
result = await _apply_resource_patch(
|
|
recommendation.pod_name,
|
|
recommendation.namespace,
|
|
recommendation.container_name,
|
|
recommendation.resource_type,
|
|
recommendation.action,
|
|
recommendation.value,
|
|
k8s_client
|
|
)
|
|
|
|
return {
|
|
"message": "Recommendation applied successfully",
|
|
"pod": recommendation.pod_name,
|
|
"namespace": recommendation.namespace,
|
|
"container": recommendation.container_name,
|
|
"action": f"{recommendation.action} {recommendation.resource_type} = {recommendation.value}",
|
|
"result": result
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error applying recommendation: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.post("/recommendations/apply")
|
|
async def apply_smart_recommendation(
|
|
recommendation: SmartRecommendation,
|
|
dry_run: bool = True,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Apply smart recommendation"""
|
|
try:
|
|
logger.info(f"Applying smart recommendation: {recommendation.title} for {recommendation.workload_name}")
|
|
|
|
if dry_run:
|
|
return {
|
|
"message": "Dry run - recommendation would be applied",
|
|
"workload": recommendation.workload_name,
|
|
"namespace": recommendation.namespace,
|
|
"type": recommendation.recommendation_type,
|
|
"priority": recommendation.priority,
|
|
"title": recommendation.title,
|
|
"description": recommendation.description,
|
|
"implementation_steps": recommendation.implementation_steps,
|
|
"kubectl_commands": recommendation.kubectl_commands,
|
|
"vpa_yaml": recommendation.vpa_yaml
|
|
}
|
|
|
|
# Apply recommendation based on type
|
|
if recommendation.recommendation_type == "vpa_activation":
|
|
result = await _apply_vpa_recommendation(recommendation, k8s_client)
|
|
elif recommendation.recommendation_type == "resource_config":
|
|
result = await _apply_resource_config_recommendation(recommendation, k8s_client)
|
|
elif recommendation.recommendation_type == "ratio_adjustment":
|
|
result = await _apply_ratio_adjustment_recommendation(recommendation, k8s_client)
|
|
else:
|
|
raise HTTPException(status_code=400, detail=f"Unknown recommendation type: {recommendation.recommendation_type}")
|
|
|
|
return {
|
|
"message": "Smart recommendation applied successfully",
|
|
"workload": recommendation.workload_name,
|
|
"namespace": recommendation.namespace,
|
|
"type": recommendation.recommendation_type,
|
|
"result": result
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error applying smart recommendation: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|

async def _apply_resource_patch(
    pod_name: str,
    namespace: str,
    container_name: str,
    resource_type: str,
    action: str,
    value: str,
    k8s_client
) -> dict:
    """Apply resource patch to deployment"""
    try:
        # Get the deployment name from the pod name
        deployment_name = _extract_deployment_name(pod_name)

        # Create patch body
        patch_body = {
            "spec": {
                "template": {
                    "spec": {
                        "containers": [{
                            "name": container_name,
                            "resources": {
                                action: {
                                    resource_type: value
                                }
                            }
                        }]
                    }
                }
            }
        }

        # Apply patch
        result = await k8s_client.patch_deployment(deployment_name, namespace, patch_body)

        return {
            "deployment": deployment_name,
            "namespace": namespace,
            "container": container_name,
            "resource_type": resource_type,
            "action": action,
            "value": value,
            "result": result
        }

    except Exception as e:
        logger.error(f"Error applying resource patch: {e}")
        raise
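
# NOTE: the patch body above only carries the one container being changed. That is
# safe as long as patch_deployment sends it as a strategic merge patch (the default
# patch type for Deployments), where the containers list is merged by container
# name; with a plain JSON merge patch the whole containers list would be replaced.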

async def _apply_vpa_recommendation(recommendation: SmartRecommendation, k8s_client) -> dict:
    """Apply VPA activation recommendation"""
    try:
        if not recommendation.vpa_yaml:
            raise ValueError("VPA YAML not provided in recommendation")

        # Apply VPA YAML
        result = await k8s_client.apply_yaml(recommendation.vpa_yaml, recommendation.namespace)

        return {
            "type": "vpa_activation",
            "workload": recommendation.workload_name,
            "namespace": recommendation.namespace,
            "vpa_yaml_applied": True,
            "result": result
        }

    except Exception as e:
        logger.error(f"Error applying VPA recommendation: {e}")
        raise

async def _apply_resource_config_recommendation(recommendation: SmartRecommendation, k8s_client) -> dict:
    """Apply resource configuration recommendation"""
    try:
        # For now, return the kubectl commands that should be executed
        # In a real implementation, these would be executed via the Kubernetes client
        return {
            "type": "resource_config",
            "workload": recommendation.workload_name,
            "namespace": recommendation.namespace,
            "kubectl_commands": recommendation.kubectl_commands,
            "message": "Resource configuration commands prepared for execution"
        }

    except Exception as e:
        logger.error(f"Error applying resource config recommendation: {e}")
        raise

async def _apply_ratio_adjustment_recommendation(recommendation: SmartRecommendation, k8s_client) -> dict:
    """Apply ratio adjustment recommendation"""
    try:
        # For now, return the kubectl commands that should be executed
        # In a real implementation, these would be executed via the Kubernetes client
        return {
            "type": "ratio_adjustment",
            "workload": recommendation.workload_name,
            "namespace": recommendation.namespace,
            "kubectl_commands": recommendation.kubectl_commands,
            "message": "Ratio adjustment commands prepared for execution"
        }

    except Exception as e:
        logger.error(f"Error applying ratio adjustment recommendation: {e}")
        raise

def _extract_deployment_name(pod_name: str) -> str:
    """Extract deployment name from pod name"""
    # Remove replica set suffix (e.g., "app-74ffb8c66-9kpdg" -> "app")
    parts = pod_name.split('-')
    if len(parts) >= 3 and parts[-2].isalnum() and parts[-1].isalnum():
        return '-'.join(parts[:-2])
    return pod_name
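
# NOTE: _extract_deployment_name and _extract_workload_name are name-based
# heuristics for Deployment-style pods (<name>-<replicaset-hash>-<pod-suffix>).
# Pods owned by StatefulSets (e.g. "db-0") or bare pods keep their full name,
# while a workload whose own name ends in hash-like segments can be trimmed too
# aggressively; resolving owner references would be the robust alternative.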
@api_router.get("/validations/historical")
|
|
async def get_historical_validations(
|
|
namespace: Optional[str] = None,
|
|
time_range: str = "24h",
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get validations with historical analysis from Prometheus"""
|
|
try:
|
|
validation_service = ValidationService()
|
|
|
|
# Collect pods
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Validate with historical analysis
|
|
all_validations = []
|
|
for pod in pods:
|
|
pod_validations = await validation_service.validate_pod_resources_with_historical_analysis(
|
|
pod, time_range
|
|
)
|
|
all_validations.extend(pod_validations)
|
|
|
|
return {
|
|
"validations": all_validations,
|
|
"total": len(all_validations),
|
|
"time_range": time_range,
|
|
"namespace": namespace or "all"
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical validations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/workloads/{namespace}/{workload}/metrics")
|
|
async def get_workload_historical_metrics(
|
|
namespace: str,
|
|
workload: str,
|
|
time_range: str = "24h"
|
|
):
|
|
"""Get historical metrics for a specific workload with cluster percentages"""
|
|
try:
|
|
prometheus_client = PrometheusClient()
|
|
await prometheus_client.initialize()
|
|
|
|
# Get cluster total resources first
|
|
cluster_cpu_query = 'sum(kube_node_status_allocatable{resource="cpu"})'
|
|
cluster_memory_query = 'sum(kube_node_status_allocatable{resource="memory"})'
|
|
|
|
cluster_cpu_data = await prometheus_client.query(cluster_cpu_query)
|
|
cluster_memory_data = await prometheus_client.query(cluster_memory_query)
|
|
|
|
# Extract cluster totals
|
|
cluster_cpu_total = 0
|
|
cluster_memory_total = 0
|
|
|
|
if cluster_cpu_data.get("status") == "success" and cluster_cpu_data.get("data", {}).get("result"):
|
|
for result in cluster_cpu_data["data"]["result"]:
|
|
cluster_cpu_total += float(result["value"][1])
|
|
|
|
if cluster_memory_data.get("status") == "success" and cluster_memory_data.get("data", {}).get("result"):
|
|
for result in cluster_memory_data["data"]["result"]:
|
|
cluster_memory_total += float(result["value"][1])
|
|
|
|
# Get workload-specific metrics using more precise queries
|
|
# CPU usage for specific pod (using regex pattern to match pod name with suffix)
|
|
cpu_usage_query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod=~"{workload}.*"}}[5m])'
|
|
memory_usage_query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod=~"{workload}.*", container!="", image!=""}}'
|
|
|
|
# Resource requests and limits for specific pod
|
|
cpu_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~"{workload}.*", resource="cpu"}})'
|
|
memory_requests_query = f'sum(kube_pod_container_resource_requests{{namespace="{namespace}", pod=~"{workload}.*", resource="memory"}})'
|
|
cpu_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~"{workload}.*", resource="cpu"}})'
|
|
memory_limits_query = f'sum(kube_pod_container_resource_limits{{namespace="{namespace}", pod=~"{workload}.*", resource="memory"}})'
|
|
|
|
# Execute queries
|
|
cpu_usage_data = await prometheus_client.query(cpu_usage_query)
|
|
memory_usage_data = await prometheus_client.query(memory_usage_query)
|
|
cpu_requests_data = await prometheus_client.query(cpu_requests_query)
|
|
memory_requests_data = await prometheus_client.query(memory_requests_query)
|
|
cpu_limits_data = await prometheus_client.query(cpu_limits_query)
|
|
memory_limits_data = await prometheus_client.query(memory_limits_query)
|
|
|
|
# Extract values
|
|
cpu_usage = 0
|
|
memory_usage = 0
|
|
cpu_requests = 0
|
|
memory_requests = 0
|
|
cpu_limits = 0
|
|
memory_limits = 0
|
|
|
|
# Extract CPU usage
|
|
if cpu_usage_data.get("status") == "success" and cpu_usage_data.get("data", {}).get("result"):
|
|
for result in cpu_usage_data["data"]["result"]:
|
|
cpu_usage += float(result["value"][1])
|
|
|
|
# Extract Memory usage
|
|
if memory_usage_data.get("status") == "success" and memory_usage_data.get("data", {}).get("result"):
|
|
for result in memory_usage_data["data"]["result"]:
|
|
memory_usage += float(result["value"][1])
|
|
|
|
# Extract CPU requests
|
|
if cpu_requests_data.get("status") == "success" and cpu_requests_data.get("data", {}).get("result"):
|
|
for result in cpu_requests_data["data"]["result"]:
|
|
cpu_requests += float(result["value"][1])
|
|
|
|
# Extract Memory requests
|
|
if memory_requests_data.get("status") == "success" and memory_requests_data.get("data", {}).get("result"):
|
|
for result in memory_requests_data["data"]["result"]:
|
|
memory_requests += float(result["value"][1])
|
|
|
|
# Extract CPU limits
|
|
if cpu_limits_data.get("status") == "success" and cpu_limits_data.get("data", {}).get("result"):
|
|
for result in cpu_limits_data["data"]["result"]:
|
|
cpu_limits += float(result["value"][1])
|
|
|
|
# Extract Memory limits
|
|
if memory_limits_data.get("status") == "success" and memory_limits_data.get("data", {}).get("result"):
|
|
for result in memory_limits_data["data"]["result"]:
|
|
memory_limits += float(result["value"][1])
|
|
|
|
# Check if we have real data
|
|
prometheus_available = cluster_cpu_total > 0 and cluster_memory_total > 0
|
|
|
|
# If no real data, return zeros with appropriate message
|
|
if not prometheus_available:
|
|
return {
|
|
"workload": workload,
|
|
"namespace": namespace,
|
|
"time_range": time_range,
|
|
"prometheus_available": False,
|
|
"data_source": "no_data",
|
|
"message": "No metrics data available for this workload",
|
|
"cluster_total": {
|
|
"cpu_cores": 0,
|
|
"memory_bytes": 0,
|
|
"memory_gb": 0
|
|
},
|
|
"workload_metrics": {
|
|
"cpu": {
|
|
"usage_cores": 0,
|
|
"usage_percent": 0,
|
|
"requests_cores": 0,
|
|
"requests_percent": 0,
|
|
"limits_cores": 0,
|
|
"limits_percent": 0,
|
|
"efficiency_percent": 0
|
|
},
|
|
"memory": {
|
|
"usage_bytes": 0,
|
|
"usage_mb": 0,
|
|
"usage_percent": 0,
|
|
"requests_bytes": 0,
|
|
"requests_mb": 0,
|
|
"requests_percent": 0,
|
|
"limits_bytes": 0,
|
|
"limits_mb": 0,
|
|
"limits_percent": 0,
|
|
"efficiency_percent": 0
|
|
}
|
|
}
|
|
}
|
|
|
|
# Calculate percentages
|
|
cpu_usage_percent = (cpu_usage / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
|
memory_usage_percent = (memory_usage / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
|
|
cpu_requests_percent = (cpu_requests / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
|
memory_requests_percent = (memory_requests / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
|
|
cpu_limits_percent = (cpu_limits / cluster_cpu_total * 100) if cluster_cpu_total > 0 else 0
|
|
memory_limits_percent = (memory_limits / cluster_memory_total * 100) if cluster_memory_total > 0 else 0
|
|
|
|
# Calculate efficiency (usage vs requests)
|
|
cpu_efficiency = (cpu_usage / cpu_requests * 100) if cpu_requests > 0 else 0
|
|
memory_efficiency = (memory_usage / memory_requests * 100) if memory_requests > 0 else 0
|
|
|
|
return {
|
|
"workload": workload,
|
|
"namespace": namespace,
|
|
"time_range": time_range,
|
|
"prometheus_available": True,
|
|
"data_source": "prometheus",
|
|
"timestamp": datetime.now().isoformat(),
|
|
"cluster_total": {
|
|
"cpu_cores": cluster_cpu_total,
|
|
"memory_bytes": cluster_memory_total,
|
|
"memory_gb": cluster_memory_total / (1024**3)
|
|
},
|
|
"workload_metrics": {
|
|
"cpu": {
|
|
"usage_cores": cpu_usage,
|
|
"usage_percent": round(cpu_usage_percent, 2),
|
|
"requests_cores": cpu_requests,
|
|
"requests_percent": round(cpu_requests_percent, 2),
|
|
"limits_cores": cpu_limits,
|
|
"limits_percent": round(cpu_limits_percent, 2),
|
|
"efficiency_percent": round(cpu_efficiency, 1)
|
|
},
|
|
"memory": {
|
|
"usage_bytes": memory_usage,
|
|
"usage_mb": round(memory_usage / (1024**2), 2),
|
|
"usage_percent": round(memory_usage_percent, 2),
|
|
"requests_bytes": memory_requests,
|
|
"requests_mb": round(memory_requests / (1024**2), 2),
|
|
"requests_percent": round(memory_requests_percent, 2),
|
|
"limits_bytes": memory_limits,
|
|
"limits_mb": round(memory_limits / (1024**2), 2),
|
|
"limits_percent": round(memory_limits_percent, 2),
|
|
"efficiency_percent": round(memory_efficiency, 1)
|
|
}
|
|
},
|
|
"promql_queries": {
|
|
"cluster_cpu_total": cluster_cpu_query,
|
|
"cluster_memory_total": cluster_memory_query,
|
|
"cpu_usage": cpu_usage_query,
|
|
"memory_usage": memory_usage_query,
|
|
"cpu_requests": cpu_requests_query,
|
|
"memory_requests": memory_requests_query,
|
|
"cpu_limits": cpu_limits_query,
|
|
"memory_limits": memory_limits_query
|
|
}
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting workload metrics for {namespace}/{workload}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
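
# Example (illustrative): for namespace "payments" and workload "checkout-api",
# cpu_requests_query above renders to
#   sum(kube_pod_container_resource_requests{namespace="payments", pod=~"checkout-api.*", resource="cpu"})
# so every pod whose name starts with the workload name is aggregated together.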
@api_router.get("/cluster/historical-summary")
|
|
async def get_cluster_historical_summary(
|
|
time_range: str = "24h"
|
|
):
|
|
"""Get cluster historical summary"""
|
|
try:
|
|
historical_service = HistoricalAnalysisService()
|
|
summary = await historical_service.get_cluster_historical_summary(time_range)
|
|
|
|
return {
|
|
"summary": summary,
|
|
"time_range": time_range,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical summary: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/namespace/{namespace}/historical-analysis")
|
|
async def get_namespace_historical_analysis(
|
|
namespace: str,
|
|
time_range: str = "24h",
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get historical analysis for a specific namespace"""
|
|
try:
|
|
historical_service = HistoricalAnalysisService()
|
|
|
|
# Get historical analysis for the namespace
|
|
analysis = await historical_service.get_namespace_historical_analysis(
|
|
namespace, time_range, k8s_client
|
|
)
|
|
|
|
return {
|
|
"namespace": namespace,
|
|
"time_range": time_range,
|
|
"analysis": analysis,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical analysis for namespace {namespace}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/namespace/{namespace}/workload/{workload}/historical-analysis")
|
|
async def get_workload_historical_analysis(
|
|
namespace: str,
|
|
workload: str,
|
|
time_range: str = "24h",
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get historical analysis for a specific workload/deployment"""
|
|
try:
|
|
historical_service = HistoricalAnalysisService()
|
|
|
|
# Get historical analysis for the workload
|
|
analysis = await historical_service.get_workload_historical_analysis(
|
|
namespace, workload, time_range
|
|
)
|
|
|
|
return {
|
|
"namespace": namespace,
|
|
"workload": workload,
|
|
"time_range": time_range,
|
|
"analysis": analysis,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical analysis for workload {workload} in namespace {namespace}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/namespace/{namespace}/pod/{pod_name}/historical-analysis")
|
|
async def get_pod_historical_analysis(
|
|
namespace: str,
|
|
pod_name: str,
|
|
time_range: str = "24h",
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get historical analysis for a specific pod (legacy endpoint)"""
|
|
try:
|
|
historical_service = HistoricalAnalysisService()
|
|
|
|
# Get historical analysis for the pod
|
|
analysis = await historical_service.get_pod_historical_analysis(
|
|
namespace, pod_name, time_range
|
|
)
|
|
|
|
return {
|
|
"namespace": namespace,
|
|
"pod_name": pod_name,
|
|
"time_range": time_range,
|
|
"analysis": analysis,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical analysis for pod {pod_name} in namespace {namespace}: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/smart-recommendations")
|
|
async def get_smart_recommendations(
|
|
namespace: Optional[str] = None,
|
|
priority: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get smart recommendations for workloads"""
|
|
try:
|
|
# Collect pods
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Get workload categories
|
|
categories = await validation_service.get_workload_categories(pods)
|
|
|
|
# Get smart recommendations
|
|
recommendations = await validation_service.get_smart_recommendations(pods)
|
|
|
|
# Filter by priority if specified
|
|
if priority:
|
|
recommendations = [
|
|
r for r in recommendations if r.priority == priority
|
|
]
|
|
|
|
return {
|
|
"recommendations": recommendations,
|
|
"categories": categories,
|
|
"total": len(recommendations)
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting smart recommendations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/workload-categories")
|
|
async def get_workload_categories(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get workload categories analysis"""
|
|
try:
|
|
# Collect pods
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Get workload categories
|
|
categories = await validation_service.get_workload_categories(pods)
|
|
|
|
# Group by category
|
|
category_summary = {}
|
|
for category in categories:
|
|
cat_type = category.category
|
|
if cat_type not in category_summary:
|
|
category_summary[cat_type] = {
|
|
"count": 0,
|
|
"total_priority_score": 0,
|
|
"workloads": []
|
|
}
|
|
|
|
category_summary[cat_type]["count"] += 1
|
|
category_summary[cat_type]["total_priority_score"] += category.priority_score
|
|
category_summary[cat_type]["workloads"].append({
|
|
"name": category.workload_name,
|
|
"namespace": category.namespace,
|
|
"priority_score": category.priority_score,
|
|
"estimated_impact": category.estimated_impact,
|
|
"vpa_candidate": category.vpa_candidate
|
|
})
|
|
|
|
# Calculate average priority scores
|
|
for cat_type in category_summary:
|
|
if category_summary[cat_type]["count"] > 0:
|
|
category_summary[cat_type]["average_priority_score"] = (
|
|
category_summary[cat_type]["total_priority_score"] /
|
|
category_summary[cat_type]["count"]
|
|
)
|
|
|
|
return {
|
|
"categories": category_summary,
|
|
"total_workloads": len(categories),
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting workload categories: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/validations/smart")
|
|
async def get_smart_validations(
|
|
namespace: Optional[str] = None,
|
|
severity: Optional[str] = None,
|
|
workload_category: Optional[str] = None,
|
|
page: int = 1,
|
|
page_size: int = 50,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get validations with smart analysis and categorization"""
|
|
try:
|
|
# Collect pods
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Get smart validations
|
|
all_validations = []
|
|
for pod in pods:
|
|
pod_validations = await validation_service.validate_pod_resources_with_smart_analysis(pod)
|
|
all_validations.extend(pod_validations)
|
|
|
|
# Filter by severity if specified
|
|
if severity:
|
|
all_validations = [
|
|
v for v in all_validations if v.severity == severity
|
|
]
|
|
|
|
# Filter by workload category if specified
|
|
if workload_category:
|
|
all_validations = [
|
|
v for v in all_validations if v.workload_category == workload_category
|
|
]
|
|
|
|
# Sort by priority score (descending)
|
|
all_validations.sort(key=lambda x: x.priority_score or 0, reverse=True)
|
|
|
|
# Pagination
|
|
total = len(all_validations)
|
|
start = (page - 1) * page_size
|
|
end = start + page_size
|
|
paginated_validations = all_validations[start:end]
|
|
|
|
return {
|
|
"validations": paginated_validations,
|
|
"pagination": {
|
|
"page": page,
|
|
"page_size": page_size,
|
|
"total": total,
|
|
"total_pages": (total + page_size - 1) // page_size
|
|
},
|
|
"summary": {
|
|
"total_validations": total,
|
|
"by_severity": {
|
|
"critical": len([v for v in all_validations if v.severity == "critical"]),
|
|
"error": len([v for v in all_validations if v.severity == "error"]),
|
|
"warning": len([v for v in all_validations if v.severity == "warning"]),
|
|
"info": len([v for v in all_validations if v.severity == "info"])
|
|
},
|
|
"by_category": {
|
|
"new": len([v for v in all_validations if v.workload_category == "new"]),
|
|
"established": len([v for v in all_validations if v.workload_category == "established"]),
|
|
"outlier": len([v for v in all_validations if v.workload_category == "outlier"]),
|
|
"compliant": len([v for v in all_validations if v.workload_category == "compliant"])
|
|
}
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting smart validations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/cluster-health")
|
|
async def get_cluster_health(k8s_client=Depends(get_k8s_client)):
|
|
"""Get cluster health overview with overcommit analysis"""
|
|
try:
|
|
pods = await k8s_client.get_all_pods()
|
|
cluster_health = await validation_service.get_cluster_health(pods)
|
|
return cluster_health
|
|
except Exception as e:
|
|
logger.error(f"Error getting cluster health: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/qos-classification")
|
|
async def get_qos_classification(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get QoS classification for pods"""
|
|
try:
|
|
if namespace:
|
|
namespace_resources = await k8s_client.get_namespace_resources(namespace)
|
|
pods = namespace_resources.pods
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
qos_classifications = []
|
|
for pod in pods:
|
|
qos = validation_service.classify_qos(pod)
|
|
qos_classifications.append(qos)
|
|
|
|
return {
|
|
"qos_classifications": qos_classifications,
|
|
"total_pods": len(pods),
|
|
"distribution": {
|
|
"Guaranteed": len([q for q in qos_classifications if q.qos_class == "Guaranteed"]),
|
|
"Burstable": len([q for q in qos_classifications if q.qos_class == "Burstable"]),
|
|
"BestEffort": len([q for q in qos_classifications if q.qos_class == "BestEffort"])
|
|
}
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting QoS classification: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
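
# Background for the distribution above (standard Kubernetes QoS semantics):
#   Guaranteed - every container sets requests == limits for both CPU and memory
#   Burstable  - at least one container sets a request or limit, but not all match
#   BestEffort - no container sets any request or limit
# classify_qos is assumed to follow these rules.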
@api_router.get("/resource-quotas")
|
|
async def get_resource_quotas(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get Resource Quota analysis"""
|
|
try:
|
|
if namespace:
|
|
namespaces = [namespace]
|
|
else:
|
|
pods = await k8s_client.get_all_pods()
|
|
namespaces = list(set(pod.namespace for pod in pods))
|
|
|
|
quotas = await validation_service.analyze_resource_quotas(namespaces)
|
|
|
|
return {
|
|
"resource_quotas": quotas,
|
|
"total_namespaces": len(namespaces),
|
|
"coverage_percentage": len([q for q in quotas if q.status == "Active"]) / len(namespaces) * 100
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error getting resource quotas: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/pod-health-scores")
|
|
async def get_pod_health_scores(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get simplified pod health scores with grouped validations"""
|
|
try:
|
|
# Get pods
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
if namespace:
|
|
pods = [pod for pod in pods if pod.namespace == namespace]
|
|
|
|
health_scores = []
|
|
|
|
for pod in pods:
|
|
# Get validations for this pod
|
|
pod_validations = validation_service.validate_pod_resources(pod)
|
|
|
|
# Calculate health score
|
|
health_score = validation_service.calculate_pod_health_score(pod, pod_validations)
|
|
health_scores.append(health_score)
|
|
|
|
# Sort by health score (worst first)
|
|
health_scores.sort(key=lambda x: x.health_score)
|
|
|
|
return {
|
|
"pods": health_scores,
|
|
"total_pods": len(health_scores),
|
|
"summary": {
|
|
"excellent": len([h for h in health_scores if h.health_score >= 9]),
|
|
"good": len([h for h in health_scores if 7 <= h.health_score < 9]),
|
|
"medium": len([h for h in health_scores if 5 <= h.health_score < 7]),
|
|
"poor": len([h for h in health_scores if 3 <= h.health_score < 5]),
|
|
"critical": len([h for h in health_scores if h.health_score < 3])
|
|
}
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting pod health scores: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
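
# NOTE: the endpoint below registers GET /smart-recommendations a second time (and
# reuses the function name get_smart_recommendations). Starlette matches routes in
# registration order, so the earlier validation_service-based handler defined above
# is the one that actually serves requests; this smart_recommendations_service
# variant is effectively shadowed until one of the two paths is renamed or removed.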
@api_router.get("/smart-recommendations")
|
|
async def get_smart_recommendations(
|
|
namespace: Optional[str] = None,
|
|
priority: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Get smart recommendations for resource optimization"""
|
|
try:
|
|
# Get all pods
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
if namespace:
|
|
pods = [pod for pod in pods if pod.namespace == namespace]
|
|
|
|
# Categorize workloads
|
|
categories = await smart_recommendations_service.categorize_workloads(pods)
|
|
|
|
# Generate smart recommendations
|
|
recommendations = await smart_recommendations_service.generate_smart_recommendations(pods, categories)
|
|
|
|
# Filter by priority if specified
|
|
if priority:
|
|
recommendations = [r for r in recommendations if r.priority == priority]
|
|
|
|
# Group by namespace
|
|
recommendations_by_namespace = {}
|
|
for rec in recommendations:
|
|
if rec.namespace not in recommendations_by_namespace:
|
|
recommendations_by_namespace[rec.namespace] = []
|
|
recommendations_by_namespace[rec.namespace].append(rec)
|
|
|
|
# Calculate summary
|
|
summary = {
|
|
"total_recommendations": len(recommendations),
|
|
"by_priority": {
|
|
"critical": len([r for r in recommendations if r.priority == "critical"]),
|
|
"high": len([r for r in recommendations if r.priority == "high"]),
|
|
"medium": len([r for r in recommendations if r.priority == "medium"]),
|
|
"low": len([r for r in recommendations if r.priority == "low"])
|
|
},
|
|
"by_type": {
|
|
"resource_config": len([r for r in recommendations if r.recommendation_type == "resource_config"]),
|
|
"vpa_activation": len([r for r in recommendations if r.recommendation_type == "vpa_activation"]),
|
|
"ratio_adjustment": len([r for r in recommendations if r.recommendation_type == "ratio_adjustment"])
|
|
},
|
|
"namespaces_affected": len(recommendations_by_namespace)
|
|
}
|
|
|
|
return {
|
|
"recommendations": recommendations,
|
|
"categories": categories,
|
|
"grouped_by_namespace": recommendations_by_namespace,
|
|
"summary": summary,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting smart recommendations: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/historical-analysis")
|
|
async def get_historical_analysis(
|
|
time_range: str = "24h",
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get historical analysis for all workloads"""
|
|
try:
|
|
# Get all pods
|
|
pods = await k8s_client.get_all_pods()
|
|
|
|
# Group pods by workload
|
|
workloads = {}
|
|
for pod in pods:
|
|
# Extract workload name from pod name (remove replica set suffix)
|
|
workload_name = _extract_workload_name(pod.name)
|
|
namespace = pod.namespace
|
|
|
|
if workload_name not in workloads:
|
|
workloads[workload_name] = {
|
|
'name': workload_name,
|
|
'namespace': namespace,
|
|
'pods': []
|
|
}
|
|
workloads[workload_name]['pods'].append(pod)
|
|
|
|
# Convert to list and add basic info
|
|
workload_list = []
|
|
for workload_name, workload_data in workloads.items():
|
|
workload_list.append({
|
|
'name': workload_name,
|
|
'namespace': workload_data['namespace'],
|
|
'pod_count': len(workload_data['pods']),
|
|
'cpu_usage': 'N/A', # Will be populated by Prometheus queries
|
|
'memory_usage': 'N/A', # Will be populated by Prometheus queries
|
|
'last_updated': datetime.now().isoformat()
|
|
})
|
|
|
|
return {
|
|
"workloads": workload_list,
|
|
"total_workloads": len(workload_list),
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error getting historical analysis: {str(e)}")
|
|
raise HTTPException(status_code=500, detail=f"Error getting historical analysis: {str(e)}")
|
|
|
|
@api_router.get("/historical-analysis/{namespace}/{workload}")
|
|
async def get_workload_historical_details(
|
|
namespace: str,
|
|
workload: str,
|
|
time_range: str = "24h",
|
|
k8s_client=Depends(get_k8s_client),
|
|
prometheus_client=Depends(get_prometheus_client)
|
|
):
|
|
"""Get detailed historical analysis for a specific workload"""
|
|
try:
|
|
# Get all pods and filter by namespace and workload
|
|
all_pods = await k8s_client.get_all_pods()
|
|
workload_pods = [
|
|
pod for pod in all_pods
|
|
if pod.namespace == namespace and _extract_workload_name(pod.name) == workload
|
|
]
|
|
|
|
if not workload_pods:
|
|
raise HTTPException(status_code=404, detail=f"Workload {workload} not found in namespace {namespace}")
|
|
|
|
# Get historical data from Prometheus
|
|
historical_service = HistoricalAnalysisService()
|
|
|
|
# Get CPU and memory usage over time
|
|
cpu_data = await historical_service.get_cpu_usage_history(namespace, workload, time_range)
|
|
memory_data = await historical_service.get_memory_usage_history(namespace, workload, time_range)
|
|
|
|
# Generate recommendations
|
|
recommendations = await historical_service.generate_recommendations(namespace, workload)
|
|
|
|
return {
|
|
"workload": workload,
|
|
"namespace": namespace,
|
|
"cpu_data": cpu_data,
|
|
"memory_data": memory_data,
|
|
"recommendations": recommendations,
|
|
"timestamp": datetime.now().isoformat()
|
|
}
|
|
|
|
except HTTPException:
|
|
raise
|
|
except Exception as e:
|
|
logger.error(f"Error getting workload historical details: {str(e)}")
|
|
raise HTTPException(status_code=500, detail=f"Error getting workload details: {str(e)}")
|
|
|
|
@api_router.get("/vpa/list")
|
|
async def list_vpas(
|
|
namespace: Optional[str] = None,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""List VPA resources"""
|
|
try:
|
|
vpas = await k8s_client.list_vpas(namespace)
|
|
return {
|
|
"vpas": vpas,
|
|
"count": len(vpas),
|
|
"namespace": namespace or "all"
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error listing VPAs: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.post("/vpa/create")
|
|
async def create_vpa(
|
|
namespace: str,
|
|
vpa_manifest: dict,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Create a VPA resource"""
|
|
try:
|
|
result = await k8s_client.create_vpa(namespace, vpa_manifest)
|
|
return {
|
|
"message": "VPA created successfully",
|
|
"vpa": result,
|
|
"namespace": namespace
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error creating VPA: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
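
# Illustrative request body for POST /vpa/create (a minimal VPA manifest; the exact
# fields accepted depend on what k8s_client.create_vpa passes through to the
# autoscaling.k8s.io CustomObjectsApi, and "my-app" / "my-app-vpa" are placeholders):
#
#   {
#     "apiVersion": "autoscaling.k8s.io/v1",
#     "kind": "VerticalPodAutoscaler",
#     "metadata": {"name": "my-app-vpa"},
#     "spec": {
#       "targetRef": {"apiVersion": "apps/v1", "kind": "Deployment", "name": "my-app"},
#       "updatePolicy": {"updateMode": "Off"}
#     }
#   }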
@api_router.delete("/vpa/{vpa_name}")
|
|
async def delete_vpa(
|
|
vpa_name: str,
|
|
namespace: str,
|
|
k8s_client=Depends(get_k8s_client)
|
|
):
|
|
"""Delete a VPA resource"""
|
|
try:
|
|
result = await k8s_client.delete_vpa(vpa_name, namespace)
|
|
return {
|
|
"message": "VPA deleted successfully",
|
|
"vpa_name": vpa_name,
|
|
"namespace": namespace
|
|
}
|
|
except Exception as e:
|
|
logger.error(f"Error deleting VPA: {e}")
|
|
raise HTTPException(status_code=500, detail=str(e))
|
|
|
|
@api_router.get("/health")
|
|
async def health_check():
|
|
"""API health check"""
|
|
return {
|
|
"status": "healthy",
|
|
"service": "resource-governance-api",
|
|
"version": "1.0.0"
|
|
}
|