feat: implement batch processing for large clusters (100 pods per batch) with memory optimization and progress tracking
app/services/batch_processing.py (new file, 284 lines)
@@ -0,0 +1,284 @@
"""
Batch Processing Service for Large Clusters

This service implements intelligent batch processing to handle large clusters
efficiently by processing pods in batches of 100, reducing memory usage and
improving performance for clusters with 10,000+ pods.
"""

import asyncio
import gc
import logging
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Callable, Dict, List, Optional

from app.core.kubernetes_client import K8sClient, PodResource
from app.services.validation_service import ValidationService
from app.services.smart_recommendations import SmartRecommendationsService
from app.services.historical_analysis import HistoricalAnalysisService

logger = logging.getLogger(__name__)


@dataclass
class BatchResult:
    """Result of a single batch processing operation"""
    batch_number: int
    total_batches: int
    pods_processed: int
    validations: List[Dict[str, Any]]
    recommendations: List[Dict[str, Any]]
    processing_time: float
    memory_usage: float
    errors: List[str]


@dataclass
class BatchProgress:
    """Progress tracking for batch processing"""
    current_batch: int
    total_batches: int
    pods_processed: int
    total_pods: int
    validations_found: int
    recommendations_generated: int
    processing_time: float
    estimated_completion: Optional[datetime]
    status: str  # 'running', 'completed', 'error', 'paused'


class BatchProcessingService:
    """Service for processing large clusters in batches"""

    def __init__(self, batch_size: int = 100):
        self.batch_size = batch_size
        self.validation_service = ValidationService()
        self.smart_recommendations_service = SmartRecommendationsService()
        self.historical_service = HistoricalAnalysisService()

    async def process_cluster_in_batches(
        self,
        k8s_client: K8sClient,
        namespace: Optional[str] = None,
        include_system_namespaces: bool = False,
        progress_callback: Optional[Callable[[BatchProgress], None]] = None
    ) -> AsyncGenerator[BatchResult, None]:
        """
        Process cluster pods in batches with progress tracking

        Args:
            k8s_client: Kubernetes client instance
            namespace: Optional namespace filter
            include_system_namespaces: Whether to include system namespaces
            progress_callback: Optional callback invoked with a BatchProgress
                after each batch

        Yields:
            BatchResult: Results for each batch processed
        """
        try:
            # Fetch the pod list up front, scoped to one namespace if given
            if namespace:
                namespace_resources = await k8s_client.get_namespace_resources(namespace)
                all_pods = namespace_resources.pods
            else:
                all_pods = await k8s_client.get_all_pods(include_system_namespaces=include_system_namespaces)

            total_pods = len(all_pods)
            # Ceiling division: the final batch may hold fewer than batch_size pods
            total_batches = (total_pods + self.batch_size - 1) // self.batch_size
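            # e.g. 10,050 pods at batch_size=100 -> (10050 + 99) // 100 = 101
            # batches: 100 full batches plus a final batch of 50 pods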

            logger.info(f"Starting batch processing: {total_pods} pods in {total_batches} batches of {self.batch_size}")

            # Process pods in batches
            for batch_num in range(total_batches):
                start_idx = batch_num * self.batch_size
                end_idx = min(start_idx + self.batch_size, total_pods)
                batch_pods = all_pods[start_idx:end_idx]

                # Process this batch
                batch_result = await self._process_batch(
                    batch_num + 1,
                    total_batches,
                    batch_pods,
                    start_idx,
                    total_pods
                )

                # Update progress with counts taken from this batch's result
                if progress_callback:
                    progress = BatchProgress(
                        current_batch=batch_num + 1,
                        total_batches=total_batches,
                        pods_processed=end_idx,
                        total_pods=total_pods,
                        validations_found=len(batch_result.validations),
                        recommendations_generated=len(batch_result.recommendations),
                        processing_time=batch_result.processing_time,
                        estimated_completion=None,  # Could be calculated from the average batch time
                        status='running'
                    )
                    progress_callback(progress)

                yield batch_result

                # Memory cleanup after each batch
                await self._cleanup_memory()

                # Small delay to prevent overwhelming the system
                await asyncio.sleep(0.1)

        except Exception as e:
            logger.error(f"Error in batch processing: {e}", exc_info=True)
            raise

    async def _process_batch(
        self,
        batch_number: int,
        total_batches: int,
        pods: List[PodResource],
        start_idx: int,
        total_pods: int
    ) -> BatchResult:
        """Process a single batch of pods"""
        start_time = datetime.now()
        errors = []
        validations = []
        recommendations = []

        try:
            logger.info(f"Processing batch {batch_number}/{total_batches}: {len(pods)} pods")

            # Process validations for this batch
            for pod in pods:
                try:
                    pod_validations = self.validation_service.validate_pod_resources(pod)
                    for validation in pod_validations:
                        validations.append({
                            'pod_name': validation.pod_name,
                            'namespace': validation.namespace,
                            'container_name': validation.container_name,
                            'validation_type': validation.validation_type,
                            'severity': validation.severity,
                            'message': validation.message,
                            'recommendation': validation.recommendation,
                            'priority_score': validation.priority_score,
                            'workload_category': validation.workload_category,
                            'estimated_impact': validation.estimated_impact
                        })
                except Exception as e:
                    error_msg = f"Error validating pod {pod.name}: {str(e)}"
                    logger.warning(error_msg)
                    errors.append(error_msg)

            # Generate smart recommendations for this batch
            try:
                batch_recommendations = await self.smart_recommendations_service.generate_smart_recommendations(pods, [])
                for rec in batch_recommendations:
                    recommendations.append({
                        'workload_name': rec.workload_name,
                        'namespace': rec.namespace,
                        'recommendation_type': rec.recommendation_type,
                        'priority_score': rec.priority_score,
                        'title': rec.title,
                        'description': rec.description,
                        'estimated_impact': rec.estimated_impact,
                        'implementation_effort': rec.implementation_effort
                    })
            except Exception as e:
                error_msg = f"Error generating recommendations for batch {batch_number}: {str(e)}"
                logger.warning(error_msg)
                errors.append(error_msg)

            processing_time = (datetime.now() - start_time).total_seconds()

            return BatchResult(
                batch_number=batch_number,
                total_batches=total_batches,
                pods_processed=len(pods),
                validations=validations,
                recommendations=recommendations,
                processing_time=processing_time,
                memory_usage=self._get_memory_usage(),
                errors=errors
            )

        except Exception as e:
            processing_time = (datetime.now() - start_time).total_seconds()
            error_msg = f"Error processing batch {batch_number}: {str(e)}"
            logger.error(error_msg, exc_info=True)

            return BatchResult(
                batch_number=batch_number,
                total_batches=total_batches,
                pods_processed=len(pods),
                validations=[],
                recommendations=[],
                processing_time=processing_time,
                memory_usage=self._get_memory_usage(),
                errors=[error_msg]
            )

    async def _cleanup_memory(self):
        """Clean up memory after each batch"""
        try:
            # Force garbage collection
            gc.collect()

            # Small delay to allow memory cleanup
            await asyncio.sleep(0.01)

        except Exception as e:
            logger.warning(f"Error during memory cleanup: {e}")

    def _get_memory_usage(self) -> float:
        """Get current memory usage in MB"""
        try:
            import psutil
            process = psutil.Process()
            return process.memory_info().rss / 1024 / 1024  # Convert bytes to MB
        except ImportError:
            # psutil is optional; report 0.0 when it is not installed
            return 0.0
        except Exception:
            return 0.0

    async def get_batch_statistics(self, k8s_client: K8sClient) -> Dict[str, Any]:
        """Get statistics about batch processing for the cluster"""
        try:
            all_pods = await k8s_client.get_all_pods(include_system_namespaces=False)
            total_pods = len(all_pods)
            total_batches = (total_pods + self.batch_size - 1) // self.batch_size

            # Group by namespace
            namespace_counts = {}
            for pod in all_pods:
                namespace_counts[pod.namespace] = namespace_counts.get(pod.namespace, 0) + 1

            return {
                'total_pods': total_pods,
                'total_namespaces': len(namespace_counts),
                'batch_size': self.batch_size,
                'total_batches': total_batches,
                'estimated_processing_time': total_batches * 2.0,  # rough estimate: 2 seconds per batch
                'namespace_distribution': namespace_counts,
                'memory_efficiency': 'High' if total_batches > 10 else 'Standard',
                'recommended_batch_size': self._recommend_batch_size(total_pods)
            }

        except Exception as e:
            logger.error(f"Error getting batch statistics: {e}", exc_info=True)
            return {
                'error': str(e),
                'total_pods': 0,
                'total_batches': 0
            }
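
    # Illustrative shape of a successful statistics payload (hypothetical
    # values for a 2,500-pod cluster with the default batch size of 100):
    # {
    #     'total_pods': 2500, 'total_namespaces': 2, 'batch_size': 100,
    #     'total_batches': 25, 'estimated_processing_time': 50.0,
    #     'namespace_distribution': {'default': 1800, 'monitoring': 700},
    #     'memory_efficiency': 'High', 'recommended_batch_size': 100
    # }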

    def _recommend_batch_size(self, total_pods: int) -> int:
        """Recommend optimal batch size based on cluster size"""
        if total_pods < 1000:
            return 50
        elif total_pods < 5000:
            return 100
        elif total_pods < 10000:
            return 150
        else:
            return 200
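    # e.g. 800 pods -> 50, 2,500 -> 100, 7,500 -> 150, 25,000 -> 200: larger
    # clusters get larger batches so the total batch count stays manageable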


# Global instance
batch_processing_service = BatchProcessingService()
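
A minimal consumption sketch (not part of this commit): it drives the async generator with a simple progress printer and collects per-batch errors. The `K8sClient()` construction below is a placeholder; adapt it to however the client is built in this codebase.

import asyncio

from app.core.kubernetes_client import K8sClient
from app.services.batch_processing import BatchProgress, batch_processing_service


def print_progress(progress: BatchProgress) -> None:
    # Invoked by the service after each batch completes
    print(f"batch {progress.current_batch}/{progress.total_batches}: "
          f"{progress.pods_processed}/{progress.total_pods} pods, "
          f"{progress.validations_found} validations in this batch")


async def main() -> None:
    k8s_client = K8sClient()  # placeholder construction; adapt to your setup
    errors = []
    async for batch in batch_processing_service.process_cluster_in_batches(
        k8s_client,
        include_system_namespaces=False,
        progress_callback=print_progress,
    ):
        errors.extend(batch.errors)
    if errors:
        print(f"{len(errors)} errors during processing")


asyncio.run(main())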