Feat: implementar sistema de recomendações inteligentes e categorização de workloads

This commit is contained in:
2025-09-29 15:26:09 -03:00
parent 63a284f4b2
commit afc7462b40
7 changed files with 1491 additions and 91 deletions

View File

@@ -0,0 +1,440 @@
"""
Smart recommendations service for resource governance
"""
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime, timedelta
from dataclasses import dataclass
from app.models.resource_models import (
PodResource,
WorkloadCategory,
SmartRecommendation,
ResourceValidation
)
from app.services.historical_analysis import HistoricalAnalysisService
logger = logging.getLogger(__name__)
@dataclass
class WorkloadAnalysis:
"""Workload analysis data"""
workload_name: str
namespace: str
age_days: int
has_requests: bool
has_limits: bool
has_optimal_ratios: bool
resource_usage: Optional[Dict[str, float]] = None
historical_data_available: bool = False
class SmartRecommendationsService:
"""Service for generating smart recommendations"""
def __init__(self):
self.historical_analysis = HistoricalAnalysisService()
self.new_workload_threshold_days = 7
self.outlier_cpu_threshold = 0.8 # 80% CPU usage
self.outlier_memory_threshold = 0.8 # 80% Memory usage
async def categorize_workloads(self, pods: List[PodResource]) -> List[WorkloadCategory]:
"""Categorize workloads based on age and resource configuration"""
categories = []
# Group pods by workload (deployment)
workloads = self._group_pods_by_workload(pods)
for workload_name, workload_pods in workloads.items():
if not workload_pods:
continue
# Analyze workload
analysis = await self._analyze_workload(workload_name, workload_pods)
# Categorize workload
category = self._categorize_workload(analysis)
categories.append(category)
return categories
async def generate_smart_recommendations(
self,
pods: List[PodResource],
categories: List[WorkloadCategory]
) -> List[SmartRecommendation]:
"""Generate smart recommendations based on workload analysis"""
recommendations = []
for category in categories:
workload_pods = [p for p in pods if self._extract_workload_name(p.name) == category.workload_name and p.namespace == category.namespace]
if not workload_pods:
continue
# Generate recommendations based on category
workload_recommendations = await self._generate_workload_recommendations(
category, workload_pods
)
recommendations.extend(workload_recommendations)
# Sort by priority
recommendations.sort(key=lambda x: self._get_priority_score(x.priority), reverse=True)
return recommendations
def _group_pods_by_workload(self, pods: List[PodResource]) -> Dict[str, List[PodResource]]:
"""Group pods by workload (deployment) name"""
workloads = {}
for pod in pods:
workload_name = self._extract_workload_name(pod.name)
if workload_name not in workloads:
workloads[workload_name] = []
workloads[workload_name].append(pod)
return workloads
def _extract_workload_name(self, pod_name: str) -> str:
"""Extract workload name from pod name"""
# Remove replica set suffix (e.g., "app-74ffb8c66-9kpdg" -> "app")
parts = pod_name.split('-')
if len(parts) >= 3 and parts[-2].isalnum() and parts[-1].isalnum():
return '-'.join(parts[:-2])
return pod_name
async def _analyze_workload(self, workload_name: str, pods: List[PodResource]) -> WorkloadAnalysis:
"""Analyze a workload to determine its characteristics"""
if not pods:
return WorkloadAnalysis(workload_name, "", 0, False, False, False)
# Get namespace from first pod
namespace = pods[0].namespace
# Calculate age (use oldest pod)
oldest_pod = min(pods, key=lambda p: p.creation_timestamp if hasattr(p, 'creation_timestamp') else datetime.now())
age_days = 0
if hasattr(oldest_pod, 'creation_timestamp'):
age_days = (datetime.now() - oldest_pod.creation_timestamp).days
# Analyze resource configuration
has_requests = all(
any(container.resources.get("requests") for container in pod.containers)
for pod in pods
)
has_limits = all(
any(container.resources.get("limits") for container in pod.containers)
for pod in pods
)
# Check for optimal ratios (simplified)
has_optimal_ratios = True
for pod in pods:
for container in pod.containers:
resources = container.resources
requests = resources.get("requests", {})
limits = resources.get("limits", {})
if requests and limits:
# Check CPU ratio
if "cpu" in requests and "cpu" in limits:
try:
cpu_request = self._parse_cpu_value(requests["cpu"])
cpu_limit = self._parse_cpu_value(limits["cpu"])
if cpu_request > 0 and cpu_limit / cpu_request > 5.0: # > 5:1 ratio
has_optimal_ratios = False
except:
pass
# Check memory ratio
if "memory" in requests and "memory" in limits:
try:
mem_request = self._parse_memory_value(requests["memory"])
mem_limit = self._parse_memory_value(limits["memory"])
if mem_request > 0 and mem_limit / mem_request > 5.0: # > 5:1 ratio
has_optimal_ratios = False
except:
pass
# Check historical data availability
historical_data_available = False
try:
# Try to get historical data for the workload
historical_data = await self.historical_analysis.get_workload_historical_analysis(
namespace, workload_name, "7d"
)
historical_data_available = not historical_data.get('error')
except:
pass
return WorkloadAnalysis(
workload_name=workload_name,
namespace=namespace,
age_days=age_days,
has_requests=has_requests,
has_limits=has_limits,
has_optimal_ratios=has_optimal_ratios,
historical_data_available=historical_data_available
)
def _categorize_workload(self, analysis: WorkloadAnalysis) -> WorkloadCategory:
"""Categorize workload based on analysis"""
# Determine category
if analysis.age_days < self.new_workload_threshold_days:
category = "new"
elif not analysis.has_requests or not analysis.has_limits:
category = "outlier"
elif not analysis.has_optimal_ratios:
category = "outlier"
else:
category = "compliant"
# Determine resource config status
if not analysis.has_requests:
resource_status = "missing_requests"
elif not analysis.has_limits:
resource_status = "missing_limits"
elif not analysis.has_optimal_ratios:
resource_status = "suboptimal_ratio"
else:
resource_status = "compliant"
# Calculate priority score
priority_score = self._calculate_priority_score(analysis, category, resource_status)
# Determine estimated impact
estimated_impact = self._determine_impact(priority_score, category)
# Determine if VPA candidate
vpa_candidate = (
category == "new" or
(category == "outlier" and not analysis.historical_data_available)
)
return WorkloadCategory(
workload_name=analysis.workload_name,
namespace=analysis.namespace,
category=category,
age_days=analysis.age_days,
resource_config_status=resource_status,
priority_score=priority_score,
estimated_impact=estimated_impact,
vpa_candidate=vpa_candidate,
historical_data_available=analysis.historical_data_available
)
def _calculate_priority_score(self, analysis: WorkloadAnalysis, category: str, resource_status: str) -> int:
"""Calculate priority score (1-10) for workload"""
score = 1
# Base score by category
if category == "outlier":
score += 4
elif category == "new":
score += 2
# Add score by resource status
if resource_status == "missing_requests":
score += 3
elif resource_status == "missing_limits":
score += 2
elif resource_status == "suboptimal_ratio":
score += 1
# Add score for production namespaces
if analysis.namespace in ["default", "production", "prod"]:
score += 2
# Add score for age (older workloads are more critical)
if analysis.age_days > 30:
score += 1
return min(score, 10)
def _determine_impact(self, priority_score: int, category: str) -> str:
"""Determine estimated impact based on priority score and category"""
if priority_score >= 8:
return "critical"
elif priority_score >= 6:
return "high"
elif priority_score >= 4:
return "medium"
else:
return "low"
async def _generate_workload_recommendations(
self,
category: WorkloadCategory,
pods: List[PodResource]
) -> List[SmartRecommendation]:
"""Generate recommendations for a specific workload"""
recommendations = []
if category.category == "new":
# New workload recommendations
recommendations.append(self._create_vpa_activation_recommendation(category))
elif category.category == "outlier":
if category.resource_config_status == "missing_requests":
recommendations.append(self._create_missing_requests_recommendation(category, pods))
elif category.resource_config_status == "missing_limits":
recommendations.append(self._create_missing_limits_recommendation(category, pods))
elif category.resource_config_status == "suboptimal_ratio":
recommendations.append(self._create_ratio_adjustment_recommendation(category, pods))
# Add VPA recommendation for outliers without historical data
if category.vpa_candidate and not category.historical_data_available:
recommendations.append(self._create_vpa_activation_recommendation(category))
return recommendations
def _create_vpa_activation_recommendation(self, category: WorkloadCategory) -> SmartRecommendation:
"""Create VPA activation recommendation"""
return SmartRecommendation(
workload_name=category.workload_name,
namespace=category.namespace,
recommendation_type="vpa_activation",
priority=category.estimated_impact,
title=f"Activate VPA for {category.workload_name}",
description=f"Enable VPA for {category.workload_name} to get automatic resource recommendations based on usage patterns.",
confidence_level=0.8 if category.historical_data_available else 0.6,
estimated_impact=category.estimated_impact,
implementation_steps=[
f"Create VPA resource for {category.workload_name}",
"Set updateMode to 'Off' for recommendation-only mode",
"Monitor VPA recommendations for 24-48 hours",
"Apply recommended values when confident"
],
kubectl_commands=[
f"kubectl create -f vpa-{category.workload_name}.yaml"
],
vpa_yaml=self._generate_vpa_yaml(category)
)
def _create_missing_requests_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
"""Create missing requests recommendation"""
return SmartRecommendation(
workload_name=category.workload_name,
namespace=category.namespace,
recommendation_type="resource_config",
priority=category.estimated_impact,
title=f"Add Resource Requests for {category.workload_name}",
description=f"Define CPU and memory requests for {category.workload_name} to guarantee QoS and enable proper scheduling.",
confidence_level=0.9,
estimated_impact=category.estimated_impact,
implementation_steps=[
f"Analyze current resource usage for {category.workload_name}",
"Set CPU requests based on P95 usage + 20% buffer",
"Set memory requests based on P95 usage + 20% buffer",
"Update deployment with new resource requests"
],
kubectl_commands=[
f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}}}}}}]}}}}}}}}'"
]
)
def _create_missing_limits_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
"""Create missing limits recommendation"""
return SmartRecommendation(
workload_name=category.workload_name,
namespace=category.namespace,
recommendation_type="resource_config",
priority=category.estimated_impact,
title=f"Add Resource Limits for {category.workload_name}",
description=f"Define CPU and memory limits for {category.workload_name} to prevent excessive resource consumption.",
confidence_level=0.9,
estimated_impact=category.estimated_impact,
implementation_steps=[
f"Analyze current resource usage for {category.workload_name}",
"Set CPU limits based on P95 usage * 3 (3:1 ratio)",
"Set memory limits based on P95 usage * 3 (3:1 ratio)",
"Update deployment with new resource limits"
],
kubectl_commands=[
f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'"
]
)
def _create_ratio_adjustment_recommendation(self, category: WorkloadCategory, pods: List[PodResource]) -> SmartRecommendation:
"""Create ratio adjustment recommendation"""
return SmartRecommendation(
workload_name=category.workload_name,
namespace=category.namespace,
recommendation_type="ratio_adjustment",
priority=category.estimated_impact,
title=f"Adjust Resource Ratios for {category.workload_name}",
description=f"Optimize CPU and memory limit:request ratios for {category.workload_name} to follow best practices (3:1 ratio).",
confidence_level=0.8,
estimated_impact=category.estimated_impact,
implementation_steps=[
f"Analyze current resource ratios for {category.workload_name}",
"Adjust limits to maintain 3:1 ratio with requests",
"Test with updated ratios in staging environment",
"Apply changes to production"
],
kubectl_commands=[
f"kubectl patch deployment {category.workload_name} -n {category.namespace} -p '{{\"spec\":{{\"template\":{{\"spec\":{{\"containers\":[{{\"name\":\"{category.workload_name}\",\"resources\":{{\"requests\":{{\"cpu\":\"200m\",\"memory\":\"512Mi\"}},\"limits\":{{\"cpu\":\"600m\",\"memory\":\"1536Mi\"}}}}}}]}}}}}}}}'"
]
)
def _generate_vpa_yaml(self, category: WorkloadCategory) -> str:
"""Generate VPA YAML for workload"""
return f"""apiVersion: autoscaling.k8s.io/v1
kind: VerticalPodAutoscaler
metadata:
name: {category.workload_name}-vpa
namespace: {category.namespace}
spec:
targetRef:
apiVersion: apps/v1
kind: Deployment
name: {category.workload_name}
updatePolicy:
updateMode: "Off" # Recommendation only
resourcePolicy:
containerPolicies:
- containerName: {category.workload_name}
maxAllowed:
cpu: 2
memory: 4Gi
minAllowed:
cpu: 100m
memory: 128Mi"""
def _get_priority_score(self, priority: str) -> int:
"""Convert priority string to numeric score for sorting"""
priority_map = {
"critical": 4,
"high": 3,
"medium": 2,
"low": 1
}
return priority_map.get(priority, 0)
def _parse_cpu_value(self, value: str) -> float:
"""Convert CPU value to float (cores)"""
if value.endswith('m'):
return float(value[:-1]) / 1000
elif value.endswith('n'):
return float(value[:-1]) / 1000000000
else:
return float(value)
def _parse_memory_value(self, value: str) -> int:
"""Convert memory value to bytes"""
value = value.upper()
if value.endswith('KI'):
return int(float(value[:-2]) * 1024)
elif value.endswith('MI'):
return int(float(value[:-2]) * 1024 * 1024)
elif value.endswith('GI'):
return int(float(value[:-2]) * 1024 * 1024 * 1024)
elif value.endswith('K'):
return int(float(value[:-1]) * 1000)
elif value.endswith('M'):
return int(float(value[:-1]) * 1000 * 1000)
elif value.endswith('G'):
return int(float(value[:-1]) * 1000 * 1000 * 1000)
else:
return int(value)

View File

@@ -9,6 +9,7 @@ import re
from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources
from app.core.config import settings
from app.services.historical_analysis import HistoricalAnalysisService
from app.services.smart_recommendations import SmartRecommendationsService
logger = logging.getLogger(__name__)
@@ -21,6 +22,7 @@ class ValidationService:
self.min_cpu_request = settings.min_cpu_request
self.min_memory_request = settings.min_memory_request
self.historical_analysis = HistoricalAnalysisService()
self.smart_recommendations = SmartRecommendationsService()
def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]:
"""Validate pod resources"""
@@ -365,3 +367,124 @@ class ValidationService:
)
return recommendations
async def validate_pod_resources_with_categorization(
self,
pod: PodResource,
workload_category: str = None,
priority_score: int = None
) -> List[ResourceValidation]:
"""Validate pod resources with enhanced categorization and scoring"""
validations = self.validate_pod_resources(pod)
# Add categorization and scoring to validations
for validation in validations:
validation.workload_category = workload_category
validation.priority_score = priority_score or self._calculate_priority_score(validation)
validation.estimated_impact = self._determine_impact(validation.priority_score)
return validations
async def validate_pod_resources_with_smart_analysis(
self,
pod: PodResource,
time_range: str = '24h'
) -> List[ResourceValidation]:
"""Validate pod resources with smart analysis including historical data"""
# Static validations
static_validations = self.validate_pod_resources(pod)
# Get workload category
workload_category = await self._categorize_workload(pod)
# Get smart recommendations
smart_recommendations = await self.smart_recommendations.generate_smart_recommendations([pod], [workload_category])
# Enhance validations with smart analysis
enhanced_validations = []
for validation in static_validations:
validation.workload_category = workload_category.category
validation.priority_score = self._calculate_priority_score(validation)
validation.estimated_impact = self._determine_impact(validation.priority_score)
enhanced_validations.append(validation)
# Add smart recommendations as validations
for recommendation in smart_recommendations:
smart_validation = ResourceValidation(
pod_name=pod.name,
namespace=pod.namespace,
container_name="workload",
validation_type="smart_recommendation",
severity=recommendation.priority,
message=recommendation.title,
recommendation=recommendation.description,
priority_score=self._get_priority_score_from_string(recommendation.priority),
workload_category=workload_category.category,
estimated_impact=recommendation.estimated_impact
)
enhanced_validations.append(smart_validation)
return enhanced_validations
async def _categorize_workload(self, pod: PodResource) -> Any:
"""Categorize a single workload"""
categories = await self.smart_recommendations.categorize_workloads([pod])
return categories[0] if categories else None
def _get_priority_score_from_string(self, priority: str) -> int:
"""Convert priority string to numeric score"""
priority_map = {
"critical": 10,
"high": 8,
"medium": 5,
"low": 2
}
return priority_map.get(priority, 5)
def _calculate_priority_score(self, validation: ResourceValidation) -> int:
"""Calculate priority score for validation (1-10)"""
score = 1
# Base score by severity
if validation.severity == "critical":
score += 4
elif validation.severity == "error":
score += 3
elif validation.severity == "warning":
score += 1
# Add score by validation type
if validation.validation_type == "missing_requests":
score += 3
elif validation.validation_type == "missing_limits":
score += 2
elif validation.validation_type == "invalid_ratio":
score += 1
elif validation.validation_type == "overcommit":
score += 4
# Add score for production namespaces
if validation.namespace in ["default", "production", "prod"]:
score += 2
return min(score, 10)
def _determine_impact(self, priority_score: int) -> str:
"""Determine estimated impact based on priority score"""
if priority_score >= 8:
return "critical"
elif priority_score >= 6:
return "high"
elif priority_score >= 4:
return "medium"
else:
return "low"
async def get_workload_categories(self, pods: List[PodResource]) -> List[Any]:
"""Get workload categories for all pods"""
return await self.smart_recommendations.categorize_workloads(pods)
async def get_smart_recommendations(self, pods: List[PodResource]) -> List[Any]:
"""Get smart recommendations for all workloads"""
categories = await self.get_workload_categories(pods)
return await self.smart_recommendations.generate_smart_recommendations(pods, categories)