diff --git a/DOCUMENTATION.md b/DOCUMENTATION.md index d2cd3cc..51ca4f1 100644 --- a/DOCUMENTATION.md +++ b/DOCUMENTATION.md @@ -1,42 +1,71 @@ -# 📚 Documentação - OpenShift Resource Governance Tool +# 📚 Documentation - OpenShift Resource Governance Tool -## 🎯 Documentação Atual +## 🎯 Current Documentation -### ✅ **Documentação Ativa (Atualizada)** -- **[README.md](README.md)** - 📖 **Documentação principal e completa** -- **[AIAgents-Support.md](AIAgents-Support.md)** - 🤖 Suporte para agentes de IA +### ✅ **Active Documentation (Updated)** +- **[README.md](README.md)** - 📖 **Main and complete documentation** +- **[AIAgents-Support.md](AIAgents-Support.md)** - 🤖 AI agents support -### ✅ **Documentação Limpa e Organizada** -Todos os arquivos desatualizados foram removidos para manter apenas a documentação atual e relevante. +### ✅ **Clean and Organized Documentation** +All outdated files have been removed to maintain only current and relevant documentation. -## 🚀 Como Usar +## 🚀 How to Use -### Para Usuários -1. **Leia o [README.md](README.md)** - Documentação completa e atualizada -2. **Use o script de deploy**: `./scripts/deploy-complete.sh` +### For Users +1. **Read [README.md](README.md)** - Complete and updated documentation +2. **Use deploy script**: `./scripts/deploy-complete.sh` -### Para Desenvolvedores -1. **Leia o [README.md](README.md)** - Instruções de desenvolvimento -2. **Consulte [AIAgents-Support.md](AIAgents-Support.md)** - Contexto do projeto +### For Developers +1. **Read [README.md](README.md)** - Development instructions +2. 
**Consult [AIAgents-Support.md](AIAgents-Support.md)** - Project context -## 📋 Status da Documentação +## 📋 Documentation Status -| Arquivo | Status | Última Atualização | Observações | -|---------|--------|-------------------|-------------| -| README.md | ✅ Ativo | 2025-01-27 | Documentação principal atualizada | -| AIAgents-Support.md | ✅ Ativo | 2025-01-27 | Suporte para IA agents | -| DOCUMENTATION.md | ✅ Ativo | 2025-01-27 | Índice da documentação | +| File | Status | Last Update | Notes | +|------|--------|-------------|-------| +| README.md | ✅ Active | 2025-01-29 | Main documentation with pragmatic roadmap | +| AIAgents-Support.md | ✅ Active | 2025-01-29 | AI agents support and project context | +| DOCUMENTATION.md | ✅ Active | 2025-01-29 | Documentation index | -**Arquivos removidos:** 6 arquivos desatualizados foram removidos para manter a documentação limpa e organizada. +**Removed files:** 6 outdated files were removed to keep documentation clean and organized. -## 🔄 Próximos Passos +## 🎯 **PRAGMATIC ROADMAP - Resource Governance Focus** -1. **Manter README.md atualizado** como fonte única da verdade -2. **Atualizar AIAgents-Support.md** conforme necessário -3. 
**Manter DOCUMENTATION.md** como índice da documentação +## 🎯 **PRAGMATIC ROADMAP - Resource Governance Focus** +**Core Mission**: List projects without requests/limits + provide smart recommendations based on historical analysis + VPA integration -## 📞 Suporte +### **Phase 1: Enhanced Validation & Categorization (IMMEDIATE - 1-2 weeks)** +- Enhanced Validation Engine with better categorization +- Workload Categorization System (New, Established, Outlier, Compliant) +- Smart Historical Analysis integration -- **Documentação principal**: [README.md](README.md) +### **Phase 2: Smart Recommendations Engine (SHORT TERM - 2-3 weeks)** +- Dedicated Recommendations Section +- Resource Configuration Recommendations +- VPA Activation Recommendations +- Priority Scoring System + +### **Phase 3: VPA Integration & Automation (MEDIUM TERM - 3-4 weeks)** +- VPA Status Detection & Management +- Smart VPA Activation +- VPA Recommendation Integration + +### **Phase 4: Action Planning & Implementation (LONG TERM - 4-6 weeks)** +- Action Plan Generation +- Implementation Tracking +- Advanced Analytics + +### **Phase 5: Enterprise Features (FUTURE - 6+ weeks)** +- Advanced Governance +- Multi-Cluster Support + +## 🔄 Next Steps + +1. **Keep README.md updated** as the single source of truth +2. **Update AIAgents-Support.md** as needed +3. 
**Maintain DOCUMENTATION.md** as the documentation index + +## 📞 Support + +- **Main documentation**: [README.md](README.md) - **Issues**: [GitHub Issues](https://github.com/andersonid/openshift-resource-governance/issues) -- **Suporte IA**: [AIAgents-Support.md](AIAgents-Support.md) +- **AI Support**: [AIAgents-Support.md](AIAgents-Support.md) diff --git a/README.md b/README.md index 99ac8c9..5f6bff0 100644 --- a/README.md +++ b/README.md @@ -282,14 +282,165 @@ curl http://localhost:8080/health ## 📝 Roadmap -### Upcoming Versions -- [ ] VPA Integration and Health Monitoring -- [ ] PDF reports with charts -- [ ] Advanced filtering and search -- [ ] Alerting system (email, Slack) -- [ ] Multi-cluster support -- [ ] RBAC integration -- [ ] API documentation (OpenAPI/Swagger) +### 🎯 **PRAGMATIC ROADMAP - Resource Governance Focus** + +**Core Mission**: List projects without requests/limits + provide smart recommendations based on historical analysis + VPA integration + +--- + +### **Phase 1: Enhanced Validation & Categorization (IMMEDIATE - 1-2 weeks)** + +#### 1.1 Smart Resource Detection +- [ ] **Enhanced Validation Engine** + - Better categorization of resource issues (missing requests, missing limits, wrong ratios) + - Severity scoring based on impact and risk + - Workload age detection (new vs established) + +- [ ] **Workload Categorization System** + - **New Workloads** (< 7 days): Flag for VPA activation + - **Established Workloads** (> 7 days): Use historical analysis + - **Outlier Workloads**: High resource usage without proper limits + - **Compliant Workloads**: Properly configured resources + +#### 1.2 Historical Analysis Integration +- [ ] **Smart Historical Analysis** + - Use historical data to suggest realistic requests/limits + - Calculate P95/P99 percentiles for recommendations + - Identify seasonal patterns and trends + - Flag workloads with insufficient historical data + +--- + +### **Phase 2: Smart Recommendations Engine (SHORT TERM - 2-3 weeks)** + 
+#### 2.1 Recommendation Dashboard +- [ ] **Dedicated Recommendations Section** + - Replace generic "VPA Recommendations" with "Smart Recommendations" + - Show actionable insights with priority levels + - Display estimated impact of changes + - Group by namespace and severity + +#### 2.2 Recommendation Types +- [ ] **Resource Configuration Recommendations** + - "Add CPU requests: 200m (based on 7-day P95 usage)" + - "Increase memory limits: 512Mi (current usage peaks at 400Mi)" + - "Fix CPU ratio: 3:1 instead of 5:1 (current: 500m limit, 100m request)" + +- [ ] **VPA Activation Recommendations** + - "Activate VPA for new workload 'example' (insufficient historical data)" + - "Enable VPA for outlier workload 'high-cpu-app' (unpredictable usage patterns)" + +#### 2.3 Priority Scoring System +- [ ] **Impact-Based Prioritization** + - **Critical**: Missing limits on high-resource workloads + - **High**: Missing requests on production workloads + - **Medium**: Suboptimal ratios on established workloads + - **Low**: New workloads needing VPA activation + +--- + +### **Phase 3: VPA Integration & Automation (MEDIUM TERM - 3-4 weeks)** + +#### 3.1 VPA Detection & Management +- [ ] **VPA Status Detection** + - Detect existing VPAs in cluster + - Show VPA health and status + - Display current VPA recommendations + - Compare VPA suggestions with current settings + +#### 3.2 Smart VPA Activation +- [ ] **Automatic VPA Suggestions** + - Suggest VPA activation for new workloads (< 7 days) + - Recommend VPA for outlier workloads + - Provide VPA YAML configurations + - Show estimated benefits of VPA activation + +#### 3.3 VPA Recommendation Integration +- [ ] **VPA Data Integration** + - Fetch VPA recommendations from cluster + - Compare VPA suggestions with historical analysis + - Show confidence levels for recommendations + - Display VPA update modes and policies + +--- + +### **Phase 4: Action Planning & Implementation (LONG TERM - 4-6 weeks)** + +#### 4.1 Action Plan Generation 
+- [ ] **Step-by-Step Action Plans** + - Generate specific kubectl/oc commands + - Show before/after resource configurations + - Estimate implementation time and effort + - Provide rollback procedures + +#### 4.2 Implementation Tracking +- [ ] **Progress Monitoring** + - Track which recommendations have been implemented + - Show improvement metrics after changes + - Alert on new issues or regressions + - Generate implementation reports + +#### 4.3 Advanced Analytics +- [ ] **Cost Optimization Insights** + - Show potential cost savings from recommendations + - Identify over-provisioned resources + - Suggest right-sizing opportunities + - Display resource utilization trends + +--- + +### **Phase 5: Enterprise Features (FUTURE - 6+ weeks)** + +#### 5.1 Advanced Governance +- [ ] **Policy Enforcement** + - Custom resource policies per namespace + - Automated compliance checking + - Policy violation alerts + - Governance reporting + +#### 5.2 Multi-Cluster Support +- [ ] **Cross-Cluster Analysis** + - Compare resource usage across clusters + - Centralized recommendation management + - Cross-cluster best practices + - Unified reporting + +--- + +## 🎯 **IMMEDIATE NEXT STEPS (This Week)** + +### Priority 1: Enhanced Validation Engine +1. **Improve Resource Detection** + - Better categorization of missing requests/limits + - Add workload age detection + - Implement severity scoring + +2. **Smart Categorization** + - New workloads (< 7 days) → VPA candidates + - Established workloads (> 7 days) → Historical analysis + - Outlier workloads → Special attention needed + +### Priority 2: Recommendation Dashboard +1. **Create Recommendations Section** + - Replace generic VPA section + - Show actionable insights + - Display priority levels + +2. **Historical Analysis Integration** + - Use Prometheus data for recommendations + - Calculate realistic resource suggestions + - Show confidence levels + +### Priority 3: VPA Integration +1. 
**VPA Detection** + - Find existing VPAs in cluster + - Show VPA status and health + - Display current recommendations + +2. **Smart VPA Suggestions** + - Identify VPA candidates + - Generate VPA configurations + - Show estimated benefits ## 🤝 Contributing diff --git a/app/api/routes.py b/app/api/routes.py index 07357fc..28bba87 100644 --- a/app/api/routes.py +++ b/app/api/routes.py @@ -9,7 +9,7 @@ from fastapi.responses import FileResponse from app.models.resource_models import ( ClusterReport, NamespaceReport, ExportRequest, - ApplyRecommendationRequest + ApplyRecommendationRequest, WorkloadCategory, SmartRecommendation ) from app.services.validation_service import ValidationService from app.services.report_service import ReportService @@ -564,6 +564,173 @@ async def get_pod_historical_analysis( logger.error(f"Error getting historical analysis for pod {pod_name} in namespace {namespace}: {e}") raise HTTPException(status_code=500, detail=str(e)) +@api_router.get("/smart-recommendations") +async def get_smart_recommendations( + namespace: Optional[str] = None, + priority: Optional[str] = None, + k8s_client=Depends(get_k8s_client) +): + """Get smart recommendations for workloads""" + try: + # Collect pods + if namespace: + namespace_resources = await k8s_client.get_namespace_resources(namespace) + pods = namespace_resources.pods + else: + pods = await k8s_client.get_all_pods() + + # Get workload categories + categories = await validation_service.get_workload_categories(pods) + + # Get smart recommendations + recommendations = await validation_service.get_smart_recommendations(pods) + + # Filter by priority if specified + if priority: + recommendations = [ + r for r in recommendations if r.priority == priority + ] + + return { + "recommendations": recommendations, + "categories": categories, + "total": len(recommendations) + } + + except Exception as e: + logger.error(f"Error getting smart recommendations: {e}") + raise HTTPException(status_code=500, detail=str(e)) + 
+@api_router.get("/workload-categories") +async def get_workload_categories( + namespace: Optional[str] = None, + k8s_client=Depends(get_k8s_client) +): + """Get workload categories analysis""" + try: + # Collect pods + if namespace: + namespace_resources = await k8s_client.get_namespace_resources(namespace) + pods = namespace_resources.pods + else: + pods = await k8s_client.get_all_pods() + + # Get workload categories + categories = await validation_service.get_workload_categories(pods) + + # Group by category + category_summary = {} + for category in categories: + cat_type = category.category + if cat_type not in category_summary: + category_summary[cat_type] = { + "count": 0, + "total_priority_score": 0, + "workloads": [] + } + + category_summary[cat_type]["count"] += 1 + category_summary[cat_type]["total_priority_score"] += category.priority_score + category_summary[cat_type]["workloads"].append({ + "name": category.workload_name, + "namespace": category.namespace, + "priority_score": category.priority_score, + "estimated_impact": category.estimated_impact, + "vpa_candidate": category.vpa_candidate + }) + + # Calculate average priority scores + for cat_type in category_summary: + if category_summary[cat_type]["count"] > 0: + category_summary[cat_type]["average_priority_score"] = ( + category_summary[cat_type]["total_priority_score"] / + category_summary[cat_type]["count"] + ) + + return { + "categories": category_summary, + "total_workloads": len(categories), + "timestamp": datetime.now().isoformat() + } + + except Exception as e: + logger.error(f"Error getting workload categories: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/validations/smart") +async def get_smart_validations( + namespace: Optional[str] = None, + severity: Optional[str] = None, + workload_category: Optional[str] = None, + page: int = 1, + page_size: int = 50, + k8s_client=Depends(get_k8s_client) +): + """Get validations with smart analysis and 
categorization""" + try: + # Collect pods + if namespace: + namespace_resources = await k8s_client.get_namespace_resources(namespace) + pods = namespace_resources.pods + else: + pods = await k8s_client.get_all_pods() + + # Get smart validations + all_validations = [] + for pod in pods: + pod_validations = await validation_service.validate_pod_resources_with_smart_analysis(pod) + all_validations.extend(pod_validations) + + # Filter by severity if specified + if severity: + all_validations = [ + v for v in all_validations if v.severity == severity + ] + + # Filter by workload category if specified + if workload_category: + all_validations = [ + v for v in all_validations if v.workload_category == workload_category + ] + + # Sort by priority score (descending) + all_validations.sort(key=lambda x: x.priority_score or 0, reverse=True) + + # Pagination + total = len(all_validations) + start = (page - 1) * page_size + end = start + page_size + paginated_validations = all_validations[start:end] + + return { + "validations": paginated_validations, + "pagination": { + "page": page, + "page_size": page_size, + "total": total, + "total_pages": (total + page_size - 1) // page_size + }, + "summary": { + "total_validations": total, + "by_severity": { + "critical": len([v for v in all_validations if v.severity == "critical"]), + "error": len([v for v in all_validations if v.severity == "error"]), + "warning": len([v for v in all_validations if v.severity == "warning"]), + "info": len([v for v in all_validations if v.severity == "info"]) + }, + "by_category": { + "new": len([v for v in all_validations if v.workload_category == "new"]), + "established": len([v for v in all_validations if v.workload_category == "established"]), + "outlier": len([v for v in all_validations if v.workload_category == "outlier"]), + "compliant": len([v for v in all_validations if v.workload_category == "compliant"]) + } + } + } + + except Exception as e: + logger.error(f"Error getting smart validations: 
{e}") + raise HTTPException(status_code=500, detail=str(e)) + @api_router.get("/health") async def health_check(): """API health check""" diff --git a/app/models/resource_models.py b/app/models/resource_models.py index 5e005b9..8c83c33 100644 --- a/app/models/resource_models.py +++ b/app/models/resource_models.py @@ -43,6 +43,9 @@ class ResourceValidation(BaseModel): severity: str # "warning", "error", "critical" message: str recommendation: Optional[str] = None + priority_score: Optional[int] = None # 1-10, higher = more critical + workload_category: Optional[str] = None # "new", "established", "outlier", "compliant" + estimated_impact: Optional[str] = None # "low", "medium", "high", "critical" class ClusterReport(BaseModel): """Cluster report""" @@ -80,3 +83,31 @@ class ApplyRecommendationRequest(BaseModel): action: str # "requests", "limits" value: str dry_run: bool = True + +class WorkloadCategory(BaseModel): + """Workload categorization""" + workload_name: str + namespace: str + category: str # "new", "established", "outlier", "compliant" + age_days: int + resource_config_status: str # "missing_requests", "missing_limits", "suboptimal_ratio", "compliant" + priority_score: int # 1-10 + estimated_impact: str # "low", "medium", "high", "critical" + vpa_candidate: bool = False + historical_data_available: bool = False + +class SmartRecommendation(BaseModel): + """Smart recommendation based on analysis""" + workload_name: str + namespace: str + recommendation_type: str # "resource_config", "vpa_activation", "ratio_adjustment" + priority: str # "critical", "high", "medium", "low" + title: str + description: str + current_config: Optional[Dict[str, str]] = None + suggested_config: Optional[Dict[str, str]] = None + confidence_level: Optional[float] = None # 0.0-1.0 + estimated_impact: Optional[str] = None + implementation_steps: Optional[List[str]] = None + kubectl_commands: Optional[List[str]] = None + vpa_yaml: Optional[str] = None diff --git 
a/app/services/smart_recommendations.py b/app/services/smart_recommendations.py new file mode 100644 index 0000000..11c19cf --- /dev/null +++ b/app/services/smart_recommendations.py @@ -0,0 +1,440 @@ +""" +Smart recommendations service for resource governance +""" +import logging +from typing import List, Dict, Any, Optional +from datetime import datetime, timedelta +from dataclasses import dataclass + +from app.models.resource_models import ( + PodResource, + WorkloadCategory, + SmartRecommendation, + ResourceValidation +) +from app.services.historical_analysis import HistoricalAnalysisService + +logger = logging.getLogger(__name__) + +@dataclass +class WorkloadAnalysis: + """Workload analysis data""" + workload_name: str + namespace: str + age_days: int + has_requests: bool + has_limits: bool + has_optimal_ratios: bool + resource_usage: Optional[Dict[str, float]] = None + historical_data_available: bool = False + +class SmartRecommendationsService: + """Service for generating smart recommendations""" + + def __init__(self): + self.historical_analysis = HistoricalAnalysisService() + self.new_workload_threshold_days = 7 + self.outlier_cpu_threshold = 0.8 # 80% CPU usage + self.outlier_memory_threshold = 0.8 # 80% Memory usage + + async def categorize_workloads(self, pods: List[PodResource]) -> List[WorkloadCategory]: + """Categorize workloads based on age and resource configuration""" + categories = [] + + # Group pods by workload (deployment) + workloads = self._group_pods_by_workload(pods) + + for workload_name, workload_pods in workloads.items(): + if not workload_pods: + continue + + # Analyze workload + analysis = await self._analyze_workload(workload_name, workload_pods) + + # Categorize workload + category = self._categorize_workload(analysis) + categories.append(category) + + return categories + + async def generate_smart_recommendations( + self, + pods: List[PodResource], + categories: List[WorkloadCategory] + ) -> List[SmartRecommendation]: + """Generate 
smart recommendations based on workload analysis""" + recommendations = [] + + for category in categories: + workload_pods = [p for p in pods if self._extract_workload_name(p.name) == category.workload_name and p.namespace == category.namespace] + + if not workload_pods: + continue + + # Generate recommendations based on category + workload_recommendations = await self._generate_workload_recommendations( + category, workload_pods + ) + recommendations.extend(workload_recommendations) + + # Sort by priority + recommendations.sort(key=lambda x: self._get_priority_score(x.priority), reverse=True) + + return recommendations + + def _group_pods_by_workload(self, pods: List[PodResource]) -> Dict[str, List[PodResource]]: + """Group pods by workload (deployment) name""" + workloads = {} + + for pod in pods: + workload_name = self._extract_workload_name(pod.name) + if workload_name not in workloads: + workloads[workload_name] = [] + workloads[workload_name].append(pod) + + return workloads + + def _extract_workload_name(self, pod_name: str) -> str: + """Extract workload name from pod name""" + # Remove replica set suffix (e.g., "app-74ffb8c66-9kpdg" -> "app") + parts = pod_name.split('-') + if len(parts) >= 3 and parts[-2].isalnum() and parts[-1].isalnum(): + return '-'.join(parts[:-2]) + return pod_name + + async def _analyze_workload(self, workload_name: str, pods: List[PodResource]) -> WorkloadAnalysis: + """Analyze a workload to determine its characteristics""" + if not pods: + return WorkloadAnalysis(workload_name, "", 0, False, False, False) + + # Get namespace from first pod + namespace = pods[0].namespace + + # Calculate age (use oldest pod) + oldest_pod = min(pods, key=lambda p: p.creation_timestamp if hasattr(p, 'creation_timestamp') else datetime.now()) + age_days = 0 + if hasattr(oldest_pod, 'creation_timestamp'): + age_days = (datetime.now() - oldest_pod.creation_timestamp).days + + # Analyze resource configuration + has_requests = all( + 
any(container.resources.get("requests") for container in pod.containers) + for pod in pods + ) + + has_limits = all( + any(container.resources.get("limits") for container in pod.containers) + for pod in pods + ) + + # Check for optimal ratios (simplified) + has_optimal_ratios = True + for pod in pods: + for container in pod.containers: + resources = container.resources + requests = resources.get("requests", {}) + limits = resources.get("limits", {}) + + if requests and limits: + # Check CPU ratio + if "cpu" in requests and "cpu" in limits: + try: + cpu_request = self._parse_cpu_value(requests["cpu"]) + cpu_limit = self._parse_cpu_value(limits["cpu"]) + if cpu_request > 0 and cpu_limit / cpu_request > 5.0: # > 5:1 ratio + has_optimal_ratios = False + except: + pass + + # Check memory ratio + if "memory" in requests and "memory" in limits: + try: + mem_request = self._parse_memory_value(requests["memory"]) + mem_limit = self._parse_memory_value(limits["memory"]) + if mem_request > 0 and mem_limit / mem_request > 5.0: # > 5:1 ratio + has_optimal_ratios = False + except: + pass + + # Check historical data availability + historical_data_available = False + try: + # Try to get historical data for the workload + historical_data = await self.historical_analysis.get_workload_historical_analysis( + namespace, workload_name, "7d" + ) + historical_data_available = not historical_data.get('error') + except: + pass + + return WorkloadAnalysis( + workload_name=workload_name, + namespace=namespace, + age_days=age_days, + has_requests=has_requests, + has_limits=has_limits, + has_optimal_ratios=has_optimal_ratios, + historical_data_available=historical_data_available + ) + + def _categorize_workload(self, analysis: WorkloadAnalysis) -> WorkloadCategory: + """Categorize workload based on analysis""" + # Determine category + if analysis.age_days < self.new_workload_threshold_days: + category = "new" + elif not analysis.has_requests or not analysis.has_limits: + category = "outlier" + 
elif not analysis.has_optimal_ratios: + category = "outlier" + else: + category = "compliant" + + # Determine resource config status + if not analysis.has_requests: + resource_status = "missing_requests" + elif not analysis.has_limits: + resource_status = "missing_limits" + elif not analysis.has_optimal_ratios: + resource_status = "suboptimal_ratio" + else: + resource_status = "compliant" + + # Calculate priority score + priority_score = self._calculate_priority_score(analysis, category, resource_status) + + # Determine estimated impact + estimated_impact = self._determine_impact(priority_score, category) + + # Determine if VPA candidate + vpa_candidate = ( + category == "new" or + (category == "outlier" and not analysis.historical_data_available) + ) + + return WorkloadCategory( + workload_name=analysis.workload_name, + namespace=analysis.namespace, + category=category, + age_days=analysis.age_days, + resource_config_status=resource_status, + priority_score=priority_score, + estimated_impact=estimated_impact, + vpa_candidate=vpa_candidate, + historical_data_available=analysis.historical_data_available + ) + + def _calculate_priority_score(self, analysis: WorkloadAnalysis, category: str, resource_status: str) -> int: + """Calculate priority score (1-10) for workload""" + score = 1 + + # Base score by category + if category == "outlier": + score += 4 + elif category == "new": + score += 2 + + # Add score by resource status + if resource_status == "missing_requests": + score += 3 + elif resource_status == "missing_limits": + score += 2 + elif resource_status == "suboptimal_ratio": + score += 1 + + # Add score for production namespaces + if analysis.namespace in ["default", "production", "prod"]: + score += 2 + + # Add score for age (older workloads are more critical) + if analysis.age_days > 30: + score += 1 + + return min(score, 10) + + def _determine_impact(self, priority_score: int, category: str) -> str: + """Determine estimated impact based on priority score and 
category""" + if priority_score >= 8: + return "critical" + elif priority_score >= 6: + return "high" + elif priority_score >= 4: + return "medium" + else: + return "low" + + async def _generate_workload_recommendations( + self, + category: WorkloadCategory, + pods: List[PodResource] + ) -> List[SmartRecommendation]: + """Generate recommendations for a specific workload""" + recommendations = [] + + if category.category == "new": + # New workload recommendations + recommendations.append(self._create_vpa_activation_recommendation(category)) + + elif category.category == "outlier": + if category.resource_config_status == "missing_requests": + recommendations.append(self._create_missing_requests_recommendation(category, pods)) + elif category.resource_config_status == "missing_limits": + recommendations.append(self._create_missing_limits_recommendation(category, pods)) + elif category.resource_config_status == "suboptimal_ratio": + recommendations.append(self._create_ratio_adjustment_recommendation(category, pods)) + + # Add VPA recommendation for outliers without historical data + if category.vpa_candidate and not category.historical_data_available: + recommendations.append(self._create_vpa_activation_recommendation(category)) + + return recommendations + + def _create_vpa_activation_recommendation(self, category: WorkloadCategory) -> SmartRecommendation: + """Create VPA activation recommendation""" + return SmartRecommendation( + workload_name=category.workload_name, + namespace=category.namespace, + recommendation_type="vpa_activation", + priority=category.estimated_impact, + title=f"Activate VPA for {category.workload_name}", + description=f"Enable VPA for {category.workload_name} to get automatic resource recommendations based on usage patterns.", + confidence_level=0.8 if category.historical_data_available else 0.6, + estimated_impact=category.estimated_impact, + implementation_steps=[ + f"Create VPA resource for {category.workload_name}", + "Set updateMode to 
'Off' for recommendation-only mode", + "Monitor VPA recommendations for 24-48 hours", + "Apply recommended values when confident" + ], + kubectl_commands=[ + f"kubectl create -f vpa-{category.workload_name}.yaml" + ], + vpa_yaml=self._generate_vpa_yaml(category) + ) + + def _create_missing_requests_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation: + """Create missing requests recommendation""" + return SmartRecommendation( + workload_name=category.workload_name, + namespace=category.namespace, + recommendation_type="resource_config", + priority=category.estimated_impact, + title=f"Add Resource Requests for {category.workload_name}", + description=f"Define CPU and memory requests for {category.workload_name} to guarantee QoS and enable proper scheduling.", + confidence_level=0.9, + estimated_impact=category.estimated_impact, + implementation_steps=[ + f"Analyze current resource usage for {category.workload_name}", + "Set CPU requests based on P95 usage + 20% buffer", + "Set memory requests based on P95 usage + 20% buffer", + "Update deployment with new resource requests" + ], + kubectl_commands=[ + f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}}}}}}]}}}}}}}}'" + ] + ) + + def _create_missing_limits_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation: + """Create missing limits recommendation""" + return SmartRecommendation( + workload_name=category.workload_name, + namespace=category.namespace, + recommendation_type="resource_config", + priority=category.estimated_impact, + title=f"Add Resource Limits for {category.workload_name}", + description=f"Define CPU and memory limits for {category.workload_name} to prevent excessive resource consumption.", + confidence_level=0.9, + 
estimated_impact=category.estimated_impact, + implementation_steps=[ + f"Analyze current resource usage for {category.workload_name}", + "Set CPU limits based on P95 usage * 3 (3:1 ratio)", + "Set memory limits based on P95 usage * 3 (3:1 ratio)", + "Update deployment with new resource limits" + ], + kubectl_commands=[ + f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'" + ] + ) + + def _create_ratio_adjustment_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation: + """Create ratio adjustment recommendation""" + return SmartRecommendation( + workload_name=category.workload_name, + namespace=category.namespace, + recommendation_type="ratio_adjustment", + priority=category.estimated_impact, + title=f"Adjust Resource Ratios for {category.workload_name}", + description=f"Optimize CPU and memory limit:request ratios for {category.workload_name} to follow best practices (3:1 ratio).", + confidence_level=0.8, + estimated_impact=category.estimated_impact, + implementation_steps=[ + f"Analyze current resource ratios for {category.workload_name}", + "Adjust limits to maintain 3:1 ratio with requests", + "Test with updated ratios in staging environment", + "Apply changes to production" + ], + kubectl_commands=[ + f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}},\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'" + ] + ) + + def _generate_vpa_yaml(self, category: WorkloadCategory) -> str: + """Generate VPA YAML for workload""" + return f"""apiVersion: autoscaling.k8s.io/v1 +kind: VerticalPodAutoscaler +metadata: + 
name: {category.workload_name}-vpa + namespace: {category.namespace} +spec: + targetRef: + apiVersion: apps/v1 + kind: Deployment + name: {category.workload_name} + updatePolicy: + updateMode: "Off" # Recommendation only + resourcePolicy: + containerPolicies: + - containerName: {category.workload_name} + maxAllowed: + cpu: 2 + memory: 4Gi + minAllowed: + cpu: 100m + memory: 128Mi""" + + def _get_priority_score(self, priority: str) -> int: + """Convert priority string to numeric score for sorting""" + priority_map = { + "critical": 4, + "high": 3, + "medium": 2, + "low": 1 + } + return priority_map.get(priority, 0) + + def _parse_cpu_value(self, value: str) -> float: + """Convert CPU value to float (cores)""" + if value.endswith('m'): + return float(value[:-1]) / 1000 + elif value.endswith('n'): + return float(value[:-1]) / 1000000000 + else: + return float(value) + + def _parse_memory_value(self, value: str) -> int: + """Convert memory value to bytes""" + value = value.upper() + + if value.endswith('KI'): + return int(float(value[:-2]) * 1024) + elif value.endswith('MI'): + return int(float(value[:-2]) * 1024 * 1024) + elif value.endswith('GI'): + return int(float(value[:-2]) * 1024 * 1024 * 1024) + elif value.endswith('K'): + return int(float(value[:-1]) * 1000) + elif value.endswith('M'): + return int(float(value[:-1]) * 1000 * 1000) + elif value.endswith('G'): + return int(float(value[:-1]) * 1000 * 1000 * 1000) + else: + return int(value) diff --git a/app/services/validation_service.py b/app/services/validation_service.py index cca6696..c5ecda9 100644 --- a/app/services/validation_service.py +++ b/app/services/validation_service.py @@ -9,6 +9,7 @@ import re from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources from app.core.config import settings from app.services.historical_analysis import HistoricalAnalysisService +from app.services.smart_recommendations import SmartRecommendationsService logger = 
logging.getLogger(__name__) @@ -21,6 +22,7 @@ class ValidationService: self.min_cpu_request = settings.min_cpu_request self.min_memory_request = settings.min_memory_request self.historical_analysis = HistoricalAnalysisService() + self.smart_recommendations = SmartRecommendationsService() def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]: """Validate pod resources""" @@ -365,3 +367,124 @@ class ValidationService: ) return recommendations + + async def validate_pod_resources_with_categorization( + self, + pod: PodResource, + workload_category: str = None, + priority_score: int = None + ) -> List[ResourceValidation]: + """Validate pod resources with enhanced categorization and scoring""" + validations = self.validate_pod_resources(pod) + + # Add categorization and scoring to validations + for validation in validations: + validation.workload_category = workload_category + validation.priority_score = priority_score or self._calculate_priority_score(validation) + validation.estimated_impact = self._determine_impact(validation.priority_score) + + return validations + + async def validate_pod_resources_with_smart_analysis( + self, + pod: PodResource, + time_range: str = '24h' + ) -> List[ResourceValidation]: + """Validate pod resources with smart analysis including historical data""" + # Static validations + static_validations = self.validate_pod_resources(pod) + + # Get workload category + workload_category = await self._categorize_workload(pod) + + # Get smart recommendations + smart_recommendations = await self.smart_recommendations.generate_smart_recommendations([pod], [workload_category]) + + # Enhance validations with smart analysis + enhanced_validations = [] + for validation in static_validations: + validation.workload_category = workload_category.category + validation.priority_score = self._calculate_priority_score(validation) + validation.estimated_impact = self._determine_impact(validation.priority_score) + 
enhanced_validations.append(validation) + + # Add smart recommendations as validations + for recommendation in smart_recommendations: + smart_validation = ResourceValidation( + pod_name=pod.name, + namespace=pod.namespace, + container_name="workload", + validation_type="smart_recommendation", + severity=recommendation.priority, + message=recommendation.title, + recommendation=recommendation.description, + priority_score=self._get_priority_score_from_string(recommendation.priority), + workload_category=workload_category.category, + estimated_impact=recommendation.estimated_impact + ) + enhanced_validations.append(smart_validation) + + return enhanced_validations + + async def _categorize_workload(self, pod: PodResource) -> Any: + """Categorize a single workload""" + categories = await self.smart_recommendations.categorize_workloads([pod]) + return categories[0] if categories else None + + def _get_priority_score_from_string(self, priority: str) -> int: + """Convert priority string to numeric score""" + priority_map = { + "critical": 10, + "high": 8, + "medium": 5, + "low": 2 + } + return priority_map.get(priority, 5) + + def _calculate_priority_score(self, validation: ResourceValidation) -> int: + """Calculate priority score for validation (1-10)""" + score = 1 + + # Base score by severity + if validation.severity == "critical": + score += 4 + elif validation.severity == "error": + score += 3 + elif validation.severity == "warning": + score += 1 + + # Add score by validation type + if validation.validation_type == "missing_requests": + score += 3 + elif validation.validation_type == "missing_limits": + score += 2 + elif validation.validation_type == "invalid_ratio": + score += 1 + elif validation.validation_type == "overcommit": + score += 4 + + # Add score for production namespaces + if validation.namespace in ["default", "production", "prod"]: + score += 2 + + return min(score, 10) + + def _determine_impact(self, priority_score: int) -> str: + """Determine 
estimated impact based on priority score""" + if priority_score >= 8: + return "critical" + elif priority_score >= 6: + return "high" + elif priority_score >= 4: + return "medium" + else: + return "low" + + async def get_workload_categories(self, pods: List[PodResource]) -> List[Any]: + """Get workload categories for all pods""" + return await self.smart_recommendations.categorize_workloads(pods) + + async def get_smart_recommendations(self, pods: List[PodResource]) -> List[Any]: + """Get smart recommendations for all workloads""" + categories = await self.get_workload_categories(pods) + return await self.smart_recommendations.generate_smart_recommendations(pods, categories) diff --git a/app/static/index.html b/app/static/index.html index a0a2e0f..ba296db 100644 --- a/app/static/index.html +++ b/app/static/index.html @@ -802,6 +802,157 @@ width: auto; } + /* Smart Recommendations Styles */ + .validation-details { + display: flex; + gap: 1rem; + margin: 0.5rem 0; + flex-wrap: wrap; + } + + .detail-item { + font-size: 0.9rem; + color: #666; + } + + .implementation-steps { + margin: 1rem 0; + padding: 1rem; + background: #f8f9fa; + border-radius: 6px; + border-left: 4px solid #007bff; + } + + .implementation-steps ol { + margin: 0.5rem 0 0 1rem; + } + + .implementation-steps li { + margin: 0.25rem 0; + } + + .kubectl-commands { + margin: 1rem 0; + padding: 1rem; + background: #f8f9fa; + border-radius: 6px; + border-left: 4px solid #28a745; + } + + .kubectl-commands pre { + margin: 0.5rem 0 0 0; + background: #2d3748; + color: #e2e8f0; + padding: 0.75rem; + border-radius: 4px; + overflow-x: auto; + font-size: 0.85rem; + } + + .vpa-yaml { + margin: 1rem 0; + padding: 1rem; + background: #f8f9fa; + border-radius: 6px; + border-left: 4px solid #ffc107; + } + + .vpa-yaml pre { + margin: 0.5rem 0 0 0; + background: #2d3748; + color: #e2e8f0; + padding: 0.75rem; + border-radius: 4px; + overflow-x: auto; + font-size: 0.85rem; + } + + /* Workload Categories Styles */ + 
.workload-list { + padding: 1rem 0; + } + + .workload-item { + background: #f8f9fa; + border: 1px solid #e9ecef; + border-radius: 6px; + padding: 1rem; + margin-bottom: 0.75rem; + } + + .workload-header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 0.5rem; + } + + .workload-name { + font-weight: 600; + color: #cc0000; + font-size: 1.1rem; + } + + .workload-namespace { + color: #666; + font-size: 0.9rem; + } + + .workload-details { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); + gap: 0.75rem; + } + + .workload-stat { + font-size: 0.9rem; + } + + .badge { + padding: 0.25rem 0.5rem; + border-radius: 12px; + font-size: 0.8rem; + font-weight: bold; + text-transform: uppercase; + } + + .badge.success { + background: #d4edda; + color: #155724; + } + + .badge.info { + background: #d1ecf1; + color: #0c5460; + } + + .badge.warning { + background: #fff3cd; + color: #856404; + } + + .badge.error { + background: #f8d7da; + color: #721c24; + } + + .badge.critical { + background: #f8d7da; + color: #721c24; + font-weight: bold; + } + + /* Severity Info */ + .severity-info { + background: #d1ecf1; + color: #0c5460; + } + + .severity-badge.severity-info { + background: #d1ecf1; + color: #0c5460; + } + @media (max-width: 768px) { .container { padding: 1rem; @@ -849,9 +1000,17 @@ 📈 Historical Resource Usage - + 🎯 - VPA Recommendations + Smart Recommendations + + + 📊 + Workload Analysis + + + ⚙️ + VPA Management @@ -960,9 +1119,62 @@ - + + + + + + + @@ -1215,8 +1427,28 @@ throw new Error(`HTTP ${response.status}: ${response.statusText}`); } - const result = await response.json(); - showSuccess(`Report exported: ${result.filepath}`); + // Get filename from Content-Disposition header + const contentDisposition = response.headers.get('Content-Disposition'); + let filename = 'report.csv'; + if (contentDisposition) { + const filenameMatch = contentDisposition.match(/filename="(.+)"/); + if (filenameMatch) { + 
filename = filenameMatch[1]; + } + } + + // Download the file + const blob = await response.blob(); + const url = window.URL.createObjectURL(blob); + const a = document.createElement('a'); + a.href = url; + a.download = filename; + document.body.appendChild(a); + a.click(); + window.URL.revokeObjectURL(url); + document.body.removeChild(a); + + showSuccess(`Report exported: ${filename}`); } catch (error) { showError('Error exporting report: ' + error.message); @@ -2001,54 +2233,6 @@ document.getElementById('exportModal').classList.remove('show'); } - async function exportReport() { - const format = document.getElementById('exportFormat').value; - const namespaces = document.getElementById('exportNamespaces').value; - const includeVPA = document.getElementById('includeVPA').checked; - const includeValidations = document.getElementById('includeValidations').checked; - - try { - const response = await fetch('/api/v1/export', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - }, - body: JSON.stringify({ - format: format, - namespaces: namespaces ? namespaces.split(',').map(ns => ns.trim()) : null, - include_vpa: includeVPA, - include_validations: includeValidations - }) - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - // Get filename from response headers - const contentDisposition = response.headers.get('Content-Disposition'); - const filename = contentDisposition - ? 
contentDisposition.split('filename=')[1].replace(/"/g, '') - : `report.${format}`; - - // Download the file - const blob = await response.blob(); - const url = window.URL.createObjectURL(blob); - const a = document.createElement('a'); - a.href = url; - a.download = filename; - document.body.appendChild(a); - a.click(); - window.URL.revokeObjectURL(url); - document.body.removeChild(a); - - closeExportModal(); - showSuccess('Report exported successfully!'); - - } catch (error) { - showError('Error exporting report: ' + error.message); - } - } // Close export modal when clicking outside document.getElementById('exportModal').addEventListener('click', function(e) { @@ -2056,6 +2240,281 @@ closeExportModal(); } }); + + // Smart Recommendations Functions + async function loadSmartRecommendations() { + showLoading(); + + try { + const priority = document.getElementById('recommendationPriorityFilter').value; + const type = document.getElementById('recommendationTypeFilter').value; + + const params = new URLSearchParams(); + if (priority) params.append('priority', priority); + + const response = await fetch(`/api/v1/smart-recommendations?${params}`); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const data = await response.json(); + displaySmartRecommendations(data, type); + document.getElementById('smartRecommendationsCard').style.display = 'block'; + + } catch (error) { + showError('Error loading smart recommendations: ' + error.message); + } finally { + hideLoading(); + } + } + + function displaySmartRecommendations(data, typeFilter) { + const container = document.getElementById('smartRecommendationsList'); + + if (!data.recommendations || data.recommendations.length === 0) { + container.innerHTML = '

No smart recommendations found.

'; + return; + } + + let recommendations = data.recommendations; + + // Filter by type if specified + if (typeFilter) { + recommendations = recommendations.filter(r => r.recommendation_type === typeFilter); + } + + if (recommendations.length === 0) { + container.innerHTML = '

No recommendations match the selected filters.

'; + return; + } + + let html = ''; + + recommendations.forEach(rec => { + const priorityClass = `severity-${rec.priority}`; + const confidenceLevel = rec.confidence_level ? `${(rec.confidence_level * 100).toFixed(0)}%` : 'N/A'; + + html += ` +
+
+ ${rec.priority} + ${rec.title} + ${rec.recommendation_type} +
+
+ Workload: ${rec.workload_name} (${rec.namespace}) +
+
+ Description: ${rec.description} +
+
+
+ Confidence: ${confidenceLevel} +
+
+ Impact: ${rec.estimated_impact || 'N/A'} +
+
+ `; + + if (rec.implementation_steps && rec.implementation_steps.length > 0) { + html += ` +
+ Implementation Steps: +
    + ${rec.implementation_steps.map(step => `
  1. ${step}
  2. `).join('')} +
+
+ `; + } + + if (rec.kubectl_commands && rec.kubectl_commands.length > 0) { + html += ` +
+ Kubectl Commands: +
${rec.kubectl_commands.join('\n')}
+
+ `; + } + + if (rec.vpa_yaml) { + html += ` +
+ VPA Configuration: +
${rec.vpa_yaml}
+
+ `; + } + + html += '
'; + }); + + container.innerHTML = html; + } + + // Workload Categories Functions + async function loadWorkloadCategories() { + showLoading(); + + try { + const category = document.getElementById('categoryFilter').value; + + const params = new URLSearchParams(); + if (category) params.append('category', category); + + const response = await fetch(`/api/v1/workload-categories?${params}`); + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const data = await response.json(); + displayWorkloadCategories(data); + document.getElementById('workloadCategoriesCard').style.display = 'block'; + + } catch (error) { + showError('Error loading workload categories: ' + error.message); + } finally { + hideLoading(); + } + } + + function displayWorkloadCategories(data) { + const container = document.getElementById('workloadCategoriesList'); + + if (!data.categories || Object.keys(data.categories).length === 0) { + container.innerHTML = '

No workload categories found.

'; + return; + } + + let html = ` +
+
+
${data.total_workloads}
+
Total Workloads
+
+
+ `; + + Object.keys(data.categories).forEach(categoryType => { + const category = data.categories[categoryType]; + const categoryClass = categoryType === 'outlier' ? 'error' : + categoryType === 'new' ? 'warning' : + categoryType === 'compliant' ? 'success' : 'info'; + + html += ` +
+
+
+ ${categoryType} + ${category.count} workloads +
+
+
Avg Priority: ${category.average_priority_score?.toFixed(1) || 'N/A'}
+
VPA Candidates: ${category.workloads.filter(w => w.vpa_candidate).length}
+
+
+
+
+
+ `; + + category.workloads.forEach(workload => { + const impactClass = workload.estimated_impact === 'critical' ? 'critical' : + workload.estimated_impact === 'high' ? 'error' : + workload.estimated_impact === 'medium' ? 'warning' : 'info'; + + html += ` +
+
+
${workload.name}
+
${workload.namespace}
+
+
+
+ Priority Score: ${workload.priority_score}/10 +
+
+ Impact: + ${workload.estimated_impact} +
+
+ VPA Candidate: + ${workload.vpa_candidate ? '✅ Yes' : '❌ No'} +
+
+
+ `; + }); + + html += ` +
+
+
+ `; + }); + + container.innerHTML = html; + } + + // Navigation Functions + function showSection(sectionName) { + // Hide all sections + document.querySelectorAll('.card').forEach(card => { + card.style.display = 'none'; + }); + + // Remove active class from all nav items + document.querySelectorAll('.nav-item').forEach(item => { + item.classList.remove('active'); + }); + + // Show selected section + const sectionMap = { + 'dashboard': 'validationsCard', + 'historical-analysis': 'historicalCard', + 'smart-recommendations': 'smartRecommendationsCard', + 'workload-categories': 'workloadCategoriesCard', + 'vpa-recommendations': 'vpaCard' + }; + + const cardId = sectionMap[sectionName]; + if (cardId) { + document.getElementById(cardId).style.display = 'block'; + } + + // Add active class to clicked nav item + document.querySelector(`[data-section="${sectionName}"]`).classList.add('active'); + + // Load data for the section + switch(sectionName) { + case 'dashboard': + loadValidationsByNamespace(); + break; + case 'historical-analysis': + loadHistoricalValidations(); + break; + case 'smart-recommendations': + loadSmartRecommendations(); + break; + case 'workload-categories': + loadWorkloadCategories(); + break; + case 'vpa-recommendations': + loadVPARecommendations(); + break; + } + } + + // Add click handlers for navigation + document.addEventListener('DOMContentLoaded', function() { + document.querySelectorAll('.nav-item').forEach(item => { + item.addEventListener('click', function(e) { + e.preventDefault(); + const section = this.getAttribute('data-section'); + showSection(section); + }); + }); + });