Feat: implementar sistema de recomendações inteligentes e categorização de workloads

2025-09-29 15:26:09 -03:00
parent 63a284f4b2
commit afc7462b40
7 changed files with 1491 additions and 91 deletions
--- a/app/services/smart_recommendations.py
+++ b/app/services/smart_recommendations.py
@@ -0,0 +1,440 @@
+"""
+Smart recommendations service for resource governance
+"""
+import logging
+from typing import List, Dict, Any, Optional
+from datetime import datetime, timedelta
+from dataclasses import dataclass
+
+from app.models.resource_models import (
+    PodResource, 
+    WorkloadCategory, 
+    SmartRecommendation,
+    ResourceValidation
+)
+from app.services.historical_analysis import HistoricalAnalysisService
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class WorkloadAnalysis:
+    """Workload analysis data"""
+    workload_name: str
+    namespace: str
+    age_days: int
+    has_requests: bool
+    has_limits: bool
+    has_optimal_ratios: bool
+    resource_usage: Optional[Dict[str, float]] = None
+    historical_data_available: bool = False
+
+class SmartRecommendationsService:
+    """Service for generating smart recommendations"""
+    
+    def __init__(self):
+        self.historical_analysis = HistoricalAnalysisService()
+        self.new_workload_threshold_days = 7
+        self.outlier_cpu_threshold = 0.8  # 80% CPU usage
+        self.outlier_memory_threshold = 0.8  # 80% Memory usage
+    
+    async def categorize_workloads(self, pods: List[PodResource]) -> List[WorkloadCategory]:
+        """Categorize workloads based on age and resource configuration"""
+        categories = []
+        
+        # Group pods by workload (deployment)
+        workloads = self._group_pods_by_workload(pods)
+        
+        for workload_name, workload_pods in workloads.items():
+            if not workload_pods:
+                continue
+                
+            # Analyze workload
+            analysis = await self._analyze_workload(workload_name, workload_pods)
+            
+            # Categorize workload
+            category = self._categorize_workload(analysis)
+            categories.append(category)
+        
+        return categories
+    
+    async def generate_smart_recommendations(
+        self, 
+        pods: List[PodResource],
+        categories: List[WorkloadCategory]
+    ) -> List[SmartRecommendation]:
+        """Generate smart recommendations based on workload analysis"""
+        recommendations = []
+        
+        for category in categories:
+            workload_pods = [p for p in pods if self._extract_workload_name(p.name) == category.workload_name and p.namespace == category.namespace]
+            
+            if not workload_pods:
+                continue
+            
+            # Generate recommendations based on category
+            workload_recommendations = await self._generate_workload_recommendations(
+                category, workload_pods
+            )
+            recommendations.extend(workload_recommendations)
+        
+        # Sort by priority
+        recommendations.sort(key=lambda x: self._get_priority_score(x.priority), reverse=True)
+        
+        return recommendations
+    
+    def _group_pods_by_workload(self, pods: List[PodResource]) -> Dict[str, List[PodResource]]:
+        """Group pods by workload (deployment) name"""
+        workloads = {}
+        
+        for pod in pods:
+            workload_name = self._extract_workload_name(pod.name)
+            if workload_name not in workloads:
+                workloads[workload_name] = []
+            workloads[workload_name].append(pod)
+        
+        return workloads
+    
+    def _extract_workload_name(self, pod_name: str) -> str:
+        """Extract workload name from pod name"""
+        # Remove replica set suffix (e.g., "app-74ffb8c66-9kpdg" -> "app")
+        parts = pod_name.split('-')
+        if len(parts) >= 3 and parts[-2].isalnum() and parts[-1].isalnum():
+            return '-'.join(parts[:-2])
+        return pod_name
+    
+    async def _analyze_workload(self, workload_name: str, pods: List[PodResource]) -> WorkloadAnalysis:
+        """Analyze a workload to determine its characteristics"""
+        if not pods:
+            return WorkloadAnalysis(workload_name, "", 0, False, False, False)
+        
+        # Get namespace from first pod
+        namespace = pods[0].namespace
+        
+        # Calculate age (use oldest pod)
+        oldest_pod = min(pods, key=lambda p: p.creation_timestamp if hasattr(p, 'creation_timestamp') else datetime.now())
+        age_days = 0
+        if hasattr(oldest_pod, 'creation_timestamp'):
+            age_days = (datetime.now() - oldest_pod.creation_timestamp).days
+        
+        # Analyze resource configuration
+        has_requests = all(
+            any(container.resources.get("requests") for container in pod.containers)
+            for pod in pods
+        )
+        
+        has_limits = all(
+            any(container.resources.get("limits") for container in pod.containers)
+            for pod in pods
+        )
+        
+        # Check for optimal ratios (simplified)
+        has_optimal_ratios = True
+        for pod in pods:
+            for container in pod.containers:
+                resources = container.resources
+                requests = resources.get("requests", {})
+                limits = resources.get("limits", {})
+                
+                if requests and limits:
+                    # Check CPU ratio
+                    if "cpu" in requests and "cpu" in limits:
+                        try:
+                            cpu_request = self._parse_cpu_value(requests["cpu"])
+                            cpu_limit = self._parse_cpu_value(limits["cpu"])
+                            if cpu_request > 0 and cpu_limit / cpu_request > 5.0:  # > 5:1 ratio
+                                has_optimal_ratios = False
+                        except:
+                            pass
+                    
+                    # Check memory ratio
+                    if "memory" in requests and "memory" in limits:
+                        try:
+                            mem_request = self._parse_memory_value(requests["memory"])
+                            mem_limit = self._parse_memory_value(limits["memory"])
+                            if mem_request > 0 and mem_limit / mem_request > 5.0:  # > 5:1 ratio
+                                has_optimal_ratios = False
+                        except:
+                            pass
+        
+        # Check historical data availability
+        historical_data_available = False
+        try:
+            # Try to get historical data for the workload
+            historical_data = await self.historical_analysis.get_workload_historical_analysis(
+                namespace, workload_name, "7d"
+            )
+            historical_data_available = not historical_data.get('error')
+        except:
+            pass
+        
+        return WorkloadAnalysis(
+            workload_name=workload_name,
+            namespace=namespace,
+            age_days=age_days,
+            has_requests=has_requests,
+            has_limits=has_limits,
+            has_optimal_ratios=has_optimal_ratios,
+            historical_data_available=historical_data_available
+        )
+    
+    def _categorize_workload(self, analysis: WorkloadAnalysis) -> WorkloadCategory:
+        """Categorize workload based on analysis"""
+        # Determine category
+        if analysis.age_days < self.new_workload_threshold_days:
+            category = "new"
+        elif not analysis.has_requests or not analysis.has_limits:
+            category = "outlier"
+        elif not analysis.has_optimal_ratios:
+            category = "outlier"
+        else:
+            category = "compliant"
+        
+        # Determine resource config status
+        if not analysis.has_requests:
+            resource_status = "missing_requests"
+        elif not analysis.has_limits:
+            resource_status = "missing_limits"
+        elif not analysis.has_optimal_ratios:
+            resource_status = "suboptimal_ratio"
+        else:
+            resource_status = "compliant"
+        
+        # Calculate priority score
+        priority_score = self._calculate_priority_score(analysis, category, resource_status)
+        
+        # Determine estimated impact
+        estimated_impact = self._determine_impact(priority_score, category)
+        
+        # Determine if VPA candidate
+        vpa_candidate = (
+            category == "new" or 
+            (category == "outlier" and not analysis.historical_data_available)
+        )
+        
+        return WorkloadCategory(
+            workload_name=analysis.workload_name,
+            namespace=analysis.namespace,
+            category=category,
+            age_days=analysis.age_days,
+            resource_config_status=resource_status,
+            priority_score=priority_score,
+            estimated_impact=estimated_impact,
+            vpa_candidate=vpa_candidate,
+            historical_data_available=analysis.historical_data_available
+        )
+    
+    def _calculate_priority_score(self, analysis: WorkloadAnalysis, category: str, resource_status: str) -> int:
+        """Calculate priority score (1-10) for workload"""
+        score = 1
+        
+        # Base score by category
+        if category == "outlier":
+            score += 4
+        elif category == "new":
+            score += 2
+        
+        # Add score by resource status
+        if resource_status == "missing_requests":
+            score += 3
+        elif resource_status == "missing_limits":
+            score += 2
+        elif resource_status == "suboptimal_ratio":
+            score += 1
+        
+        # Add score for production namespaces
+        if analysis.namespace in ["default", "production", "prod"]:
+            score += 2
+        
+        # Add score for age (older workloads are more critical)
+        if analysis.age_days > 30:
+            score += 1
+        
+        return min(score, 10)
+    
+    def _determine_impact(self, priority_score: int, category: str) -> str:
+        """Determine estimated impact based on priority score and category"""
+        if priority_score >= 8:
+            return "critical"
+        elif priority_score >= 6:
+            return "high"
+        elif priority_score >= 4:
+            return "medium"
+        else:
+            return "low"
+    
+    async def _generate_workload_recommendations(
+        self, 
+        category: WorkloadCategory, 
+        pods: List[PodResource]
+    ) -> List[SmartRecommendation]:
+        """Generate recommendations for a specific workload"""
+        recommendations = []
+        
+        if category.category == "new":
+            # New workload recommendations
+            recommendations.append(self._create_vpa_activation_recommendation(category))
+        
+        elif category.category == "outlier":
+            if category.resource_config_status == "missing_requests":
+                recommendations.append(self._create_missing_requests_recommendation(category, pods))
+            elif category.resource_config_status == "missing_limits":
+                recommendations.append(self._create_missing_limits_recommendation(category, pods))
+            elif category.resource_config_status == "suboptimal_ratio":
+                recommendations.append(self._create_ratio_adjustment_recommendation(category, pods))
+        
+        # Add VPA recommendation for outliers without historical data
+        if category.vpa_candidate and not category.historical_data_available:
+            recommendations.append(self._create_vpa_activation_recommendation(category))
+        
+        return recommendations
+    
+    def _create_vpa_activation_recommendation(self, category: WorkloadCategory) -> SmartRecommendation:
+        """Create VPA activation recommendation"""
+        return SmartRecommendation(
+            workload_name=category.workload_name,
+            namespace=category.namespace,
+            recommendation_type="vpa_activation",
+            priority=category.estimated_impact,
+            title=f"Activate VPA for {category.workload_name}",
+            description=f"Enable VPA for {category.workload_name} to get automatic resource recommendations based on usage patterns.",
+            confidence_level=0.8 if category.historical_data_available else 0.6,
+            estimated_impact=category.estimated_impact,
+            implementation_steps=[
+                f"Create VPA resource for {category.workload_name}",
+                "Set updateMode to 'Off' for recommendation-only mode",
+                "Monitor VPA recommendations for 24-48 hours",
+                "Apply recommended values when confident"
+            ],
+            kubectl_commands=[
+                f"kubectl create -f vpa-{category.workload_name}.yaml"
+            ],
+            vpa_yaml=self._generate_vpa_yaml(category)
+        )
+    
+    def _create_missing_requests_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
+        """Create missing requests recommendation"""
+        return SmartRecommendation(
+            workload_name=category.workload_name,
+            namespace=category.namespace,
+            recommendation_type="resource_config",
+            priority=category.estimated_impact,
+            title=f"Add Resource Requests for {category.workload_name}",
+            description=f"Define CPU and memory requests for {category.workload_name} to guarantee QoS and enable proper scheduling.",
+            confidence_level=0.9,
+            estimated_impact=category.estimated_impact,
+            implementation_steps=[
+                f"Analyze current resource usage for {category.workload_name}",
+                "Set CPU requests based on P95 usage + 20% buffer",
+                "Set memory requests based on P95 usage + 20% buffer",
+                "Update deployment with new resource requests"
+            ],
+            kubectl_commands=[
+                f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}}}}}}]}}}}}}}}'"
+            ]
+        )
+    
+    def _create_missing_limits_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
+        """Create missing limits recommendation"""
+        return SmartRecommendation(
+            workload_name=category.workload_name,
+            namespace=category.namespace,
+            recommendation_type="resource_config",
+            priority=category.estimated_impact,
+            title=f"Add Resource Limits for {category.workload_name}",
+            description=f"Define CPU and memory limits for {category.workload_name} to prevent excessive resource consumption.",
+            confidence_level=0.9,
+            estimated_impact=category.estimated_impact,
+            implementation_steps=[
+                f"Analyze current resource usage for {category.workload_name}",
+                "Set CPU limits based on P95 usage * 3 (3:1 ratio)",
+                "Set memory limits based on P95 usage * 3 (3:1 ratio)",
+                "Update deployment with new resource limits"
+            ],
+            kubectl_commands=[
+                f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'"
+            ]
+        )
+    
+    def _create_ratio_adjustment_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
+        """Create ratio adjustment recommendation"""
+        return SmartRecommendation(
+            workload_name=category.workload_name,
+            namespace=category.namespace,
+            recommendation_type="ratio_adjustment",
+            priority=category.estimated_impact,
+            title=f"Adjust Resource Ratios for {category.workload_name}",
+            description=f"Optimize CPU and memory limit:request ratios for {category.workload_name} to follow best practices (3:1 ratio).",
+            confidence_level=0.8,
+            estimated_impact=category.estimated_impact,
+            implementation_steps=[
+                f"Analyze current resource ratios for {category.workload_name}",
+                "Adjust limits to maintain 3:1 ratio with requests",
+                "Test with updated ratios in staging environment",
+                "Apply changes to production"
+            ],
+            kubectl_commands=[
+                f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}},\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'"
+            ]
+        )
+    
+    def _generate_vpa_yaml(self, category: WorkloadCategory) -> str:
+        """Generate VPA YAML for workload"""
+        return f"""apiVersion: autoscaling.k8s.io/v1
+kind: VerticalPodAutoscaler
+metadata:
+  name: {category.workload_name}-vpa
+  namespace: {category.namespace}
+spec:
+  targetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {category.workload_name}
+  updatePolicy:
+    updateMode: "Off"  # Recommendation only
+  resourcePolicy:
+    containerPolicies:
+    - containerName: {category.workload_name}
+      maxAllowed:
+        cpu: 2
+        memory: 4Gi
+      minAllowed:
+        cpu: 100m
+        memory: 128Mi"""
+    
+    def _get_priority_score(self, priority: str) -> int:
+        """Convert priority string to numeric score for sorting"""
+        priority_map = {
+            "critical": 4,
+            "high": 3,
+            "medium": 2,
+            "low": 1
+        }
+        return priority_map.get(priority, 0)
+    
+    def _parse_cpu_value(self, value: str) -> float:
+        """Convert CPU value to float (cores)"""
+        if value.endswith('m'):
+            return float(value[:-1]) / 1000
+        elif value.endswith('n'):
+            return float(value[:-1]) / 1000000000
+        else:
+            return float(value)
+    
+    def _parse_memory_value(self, value: str) -> int:
+        """Convert memory value to bytes"""
+        value = value.upper()
+        
+        if value.endswith('KI'):
+            return int(float(value[:-2]) * 1024)
+        elif value.endswith('MI'):
+            return int(float(value[:-2]) * 1024 * 1024)
+        elif value.endswith('GI'):
+            return int(float(value[:-2]) * 1024 * 1024 * 1024)
+        elif value.endswith('K'):
+            return int(float(value[:-1]) * 1000)
+        elif value.endswith('M'):
+            return int(float(value[:-1]) * 1000 * 1000)
+        elif value.endswith('G'):
+            return int(float(value[:-1]) * 1000 * 1000 * 1000)
+        else:
+            return int(value)
--- a/app/services/validation_service.py
+++ b/app/services/validation_service.py
@@ -9,6 +9,7 @@ import re
 from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources
 from app.core.config import settings
 from app.services.historical_analysis import HistoricalAnalysisService
+from app.services.smart_recommendations import SmartRecommendationsService

 logger = logging.getLogger(__name__)

@@ -21,6 +22,7 @@ class ValidationService:
        self.min_cpu_request = settings.min_cpu_request
        self.min_memory_request = settings.min_memory_request
        self.historical_analysis = HistoricalAnalysisService()
+        self.smart_recommendations = SmartRecommendationsService()
    
    def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]:
        """Validate pod resources"""
@@ -365,3 +367,124 @@ class ValidationService:
            )
        
        return recommendations
+    
+    async def validate_pod_resources_with_categorization(
+        self, 
+        pod: PodResource, 
+        workload_category: str = None,
+        priority_score: int = None
+    ) -> List[ResourceValidation]:
+        """Validate pod resources with enhanced categorization and scoring"""
+        validations = self.validate_pod_resources(pod)
+        
+        # Add categorization and scoring to validations
+        for validation in validations:
+            validation.workload_category = workload_category
+            validation.priority_score = priority_score or self._calculate_priority_score(validation)
+            validation.estimated_impact = self._determine_impact(validation.priority_score)
+        
+        return validations
+    
+    async def validate_pod_resources_with_smart_analysis(
+        self, 
+        pod: PodResource, 
+        time_range: str = '24h'
+    ) -> List[ResourceValidation]:
+        """Validate pod resources with smart analysis including historical data"""
+        # Static validations
+        static_validations = self.validate_pod_resources(pod)
+        
+        # Get workload category
+        workload_category = await self._categorize_workload(pod)
+        
+        # Get smart recommendations
+        smart_recommendations = await self.smart_recommendations.generate_smart_recommendations([pod], [workload_category])
+        
+        # Enhance validations with smart analysis
+        enhanced_validations = []
+        for validation in static_validations:
+            validation.workload_category = workload_category.category
+            validation.priority_score = self._calculate_priority_score(validation)
+            validation.estimated_impact = self._determine_impact(validation.priority_score)
+            enhanced_validations.append(validation)
+        
+        # Add smart recommendations as validations
+        for recommendation in smart_recommendations:
+            smart_validation = ResourceValidation(
+                pod_name=pod.name,
+                namespace=pod.namespace,
+                container_name="workload",
+                validation_type="smart_recommendation",
+                severity=recommendation.priority,
+                message=recommendation.title,
+                recommendation=recommendation.description,
+                priority_score=self._get_priority_score_from_string(recommendation.priority),
+                workload_category=workload_category.category,
+                estimated_impact=recommendation.estimated_impact
+            )
+            enhanced_validations.append(smart_validation)
+        
+        return enhanced_validations
+    
+    async def _categorize_workload(self, pod: PodResource) -> Any:
+        """Categorize a single workload"""
+        categories = await self.smart_recommendations.categorize_workloads([pod])
+        return categories[0] if categories else None
+    
+    def _get_priority_score_from_string(self, priority: str) -> int:
+        """Convert priority string to numeric score"""
+        priority_map = {
+            "critical": 10,
+            "high": 8,
+            "medium": 5,
+            "low": 2
+        }
+        return priority_map.get(priority, 5)
+    
+    def _calculate_priority_score(self, validation: ResourceValidation) -> int:
+        """Calculate priority score for validation (1-10)"""
+        score = 1
+        
+        # Base score by severity
+        if validation.severity == "critical":
+            score += 4
+        elif validation.severity == "error":
+            score += 3
+        elif validation.severity == "warning":
+            score += 1
+        
+        # Add score by validation type
+        if validation.validation_type == "missing_requests":
+            score += 3
+        elif validation.validation_type == "missing_limits":
+            score += 2
+        elif validation.validation_type == "invalid_ratio":
+            score += 1
+        elif validation.validation_type == "overcommit":
+            score += 4
+        
+        # Add score for production namespaces
+        if validation.namespace in ["default", "production", "prod"]:
+            score += 2
+        
+        return min(score, 10)
+    
+    def _determine_impact(self, priority_score: int) -> str:
+        """Determine estimated impact based on priority score"""
+        if priority_score >= 8:
+            return "critical"
+        elif priority_score >= 6:
+            return "high"
+        elif priority_score >= 4:
+            return "medium"
+        else:
+            return "low"
+    
+    async def get_workload_categories(self, pods: List[PodResource]) -> List[Any]:
+        """Get workload categories for all pods"""
+        return await self.smart_recommendations.categorize_workloads(pods)
+    
+    async def get_smart_recommendations(self, pods: List[PodResource]) -> List[Any]:
+        """Get smart recommendations for all workloads"""
+        categories = await self.get_workload_categories(pods)
+        return await self.smart_recommendations.generate_smart_recommendations(pods, categories)