Initial commit: OpenShift Resource Governance Tool

- Implementa ferramenta completa de governança de recursos
- Backend Python com FastAPI para coleta de dados
- Validações seguindo best practices Red Hat
- Integração com Prometheus e VPA
- UI web interativa para visualização
- Relatórios em JSON, CSV e PDF
- Deploy como DaemonSet com RBAC
- Scripts de automação para build e deploy
This commit is contained in:
2025-09-25 14:26:24 -03:00
commit 4d60c0e039
31 changed files with 3386 additions and 0 deletions

1
app/services/__init__.py Normal file
View File

@@ -0,0 +1 @@
# Services

View File

@@ -0,0 +1,306 @@
"""
Serviço de geração de relatórios
"""
import logging
import json
import csv
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
from io import StringIO
from app.models.resource_models import (
ClusterReport, NamespaceReport, ResourceValidation,
VPARecommendation, ExportRequest
)
from app.core.config import settings
logger = logging.getLogger(__name__)
class ReportService:
    """Service that builds resource-governance reports and exports them to disk."""

    def __init__(self):
        """Read the export directory from the settings and make sure it exists."""
        export_dir = settings.report_export_path
        # exist_ok=True: creating the service again must not fail when the
        # directory is already present.
        os.makedirs(export_dir, exist_ok=True)
        self.export_path = export_dir
def generate_cluster_report(
    self,
    pods: List[Any],
    validations: List[ResourceValidation],
    vpa_recommendations: List[VPARecommendation],
    overcommit_info: Dict[str, Any],
    nodes_info: List[Dict[str, Any]]
) -> ClusterReport:
    """Build the cluster-wide report.

    Args:
        pods: Pod objects; each must expose a ``namespace`` attribute.
        validations: Validation findings, embedded in the report unchanged.
        vpa_recommendations: VPA recommendations, embedded unchanged.
        overcommit_info: Overcommit data, embedded unchanged and also used
            for the summary.
        nodes_info: One entry per node; only the number of entries is used.

    Returns:
        A ``ClusterReport`` whose timestamp is ``datetime.now()`` in ISO 8601.
    """
    # A set comprehension collapses repeated namespaces.
    distinct_namespaces = {pod.namespace for pod in pods}
    summary_block = self._generate_summary(
        validations, vpa_recommendations, overcommit_info
    )

    return ClusterReport(
        timestamp=datetime.now().isoformat(),
        total_pods=len(pods),
        total_namespaces=len(distinct_namespaces),
        total_nodes=len(nodes_info),
        validations=validations,
        vpa_recommendations=vpa_recommendations,
        overcommit_info=overcommit_info,
        summary=summary_block
    )
def generate_namespace_report(
    self,
    namespace: str,
    pods: List[Any],
    validations: List[ResourceValidation],
    resource_usage: Dict[str, Any]
) -> NamespaceReport:
    """Build the report for a single namespace.

    Args:
        namespace: Name of the namespace being reported.
        pods: Pods counted for ``total_pods`` (only the length is used).
        validations: Findings from any namespace; only those whose
            ``namespace`` equals ``namespace`` are kept.
        resource_usage: Usage data, embedded in the report unchanged.

    Returns:
        A ``NamespaceReport`` whose timestamp is ``datetime.now()`` in ISO 8601.
    """
    own_findings = []
    for finding in validations:
        if finding.namespace == namespace:
            own_findings.append(finding)

    advice = self._generate_namespace_recommendations(own_findings)

    return NamespaceReport(
        namespace=namespace,
        timestamp=datetime.now().isoformat(),
        total_pods=len(pods),
        validations=own_findings,
        resource_usage=resource_usage,
        recommendations=advice
    )
def _generate_summary(
self,
validations: List[ResourceValidation],
vpa_recommendations: List[VPARecommendation],
overcommit_info: Dict[str, Any]
) -> Dict[str, Any]:
"""Gerar resumo do relatório"""
# Contar validações por severidade
severity_counts = {}
for validation in validations:
severity = validation.severity
if severity not in severity_counts:
severity_counts[severity] = 0
severity_counts[severity] += 1
# Contar validações por tipo
type_counts = {}
for validation in validations:
validation_type = validation.validation_type
if validation_type not in type_counts:
type_counts[validation_type] = 0
type_counts[validation_type] += 1
return {
"total_validations": len(validations),
"severity_breakdown": severity_counts,
"validation_types": type_counts,
"vpa_recommendations_count": len(vpa_recommendations),
"overcommit_detected": overcommit_info.get("overcommit_detected", False),
"critical_issues": severity_counts.get("critical", 0),
"warnings": severity_counts.get("warning", 0),
"errors": severity_counts.get("error", 0)
}
def _generate_namespace_recommendations(
self,
validations: List[ResourceValidation]
) -> List[str]:
"""Gerar recomendações para um namespace"""
recommendations = []
# Agrupar por tipo de problema
problems = {}
for validation in validations:
problem_type = validation.validation_type
if problem_type not in problems:
problems[problem_type] = []
problems[problem_type].append(validation)
# Gerar recomendações específicas
if "missing_requests" in problems:
count = len(problems["missing_requests"])
recommendations.append(
f"Criar LimitRange para definir requests padrão "
f"({count} containers sem requests)"
)
if "missing_limits" in problems:
count = len(problems["missing_limits"])
recommendations.append(
f"Definir limits para {count} containers para evitar consumo excessivo"
)
if "invalid_ratio" in problems:
count = len(problems["invalid_ratio"])
recommendations.append(
f"Ajustar ratio limit:request para {count} containers"
)
if "overcommit" in problems:
recommendations.append(
"Resolver overcommit de recursos no namespace"
)
return recommendations
async def export_report(
    self,
    report: ClusterReport,
    export_request: ExportRequest
) -> str:
    """Export a report in the format named by the request.

    Args:
        report: The report to export.
        export_request: Must expose ``format``; "json", "csv" and "pdf"
            are supported.

    Returns:
        Whatever the selected exporter returns (the path of the written file).

    Raises:
        ValueError: If the requested format is not supported.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Map each format to the exporter method name; the method is looked up
    # only once the format is known.
    exporter_names = {
        "json": "_export_json",
        "csv": "_export_csv",
        "pdf": "_export_pdf",
    }
    for fmt, method_name in exporter_names.items():
        if export_request.format == fmt:
            exporter = getattr(self, method_name)
            return await exporter(report, stamp)

    raise ValueError(f"Formato não suportado: {export_request.format}")
async def _export_json(self, report: ClusterReport, timestamp: str) -> str:
    """Write the report as pretty-printed UTF-8 JSON.

    Args:
        report: Report to serialise; its ``dict()`` result is what gets dumped.
        timestamp: Text embedded in the file name.

    Returns:
        Path of the written file inside ``self.export_path``.
    """
    target = os.path.join(self.export_path, f"cluster_report_{timestamp}.json")

    # The model is turned into plain data before being handed to json.
    payload = report.dict()

    with open(target, 'w', encoding='utf-8') as handle:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(payload, handle, indent=2, ensure_ascii=False)

    logger.info(f"Relatório JSON exportado: {target}")
    return target
async def _export_csv(self, report: ClusterReport, timestamp: str) -> str:
    """Write the report's validations as a CSV file, one row per finding.

    Args:
        report: Report whose ``validations`` are exported.
        timestamp: Text embedded in the file name.

    Returns:
        Path of the written file inside ``self.export_path``.
    """
    target = os.path.join(self.export_path, f"cluster_report_{timestamp}.csv")
    header = [
        "Pod Name", "Namespace", "Container Name",
        "Validation Type", "Severity", "Message", "Recommendation"
    ]

    # newline='' lets the csv module control line endings itself.
    with open(target, 'w', newline='', encoding='utf-8') as handle:
        sheet = csv.writer(handle)
        sheet.writerow(header)
        sheet.writerows(
            [
                finding.pod_name,
                finding.namespace,
                finding.container_name,
                finding.validation_type,
                finding.severity,
                finding.message,
                # A missing recommendation becomes an empty cell.
                finding.recommendation or ""
            ]
            for finding in report.validations
        )

    logger.info(f"Relatório CSV exportado: {target}")
    return target
async def _export_pdf(self, report: ClusterReport, timestamp: str) -> str:
    """Write the report as a PDF (title, summary, table of validations).

    Args:
        report: Report to render. Reads ``total_pods``, ``total_namespaces``,
            ``total_nodes``, ``summary`` and ``validations``.
        timestamp: Text embedded in the file name.

    Returns:
        Path of the written file inside ``self.export_path``.

    Raises:
        ValueError: If the optional ``reportlab`` package is not installed.
    """
    # Only the optional-dependency imports are guarded. Wrapping the whole
    # rendering would report any ImportError raised while building the
    # document as "reportlab missing".
    try:
        from reportlab.lib.pagesizes import letter
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
        from reportlab.lib.styles import getSampleStyleSheet
        from reportlab.lib import colors
    except ImportError as exc:
        logger.error("reportlab não instalado. Instale com: pip install reportlab")
        # Chain the original error so the real import failure stays visible.
        raise ValueError("PDF export requer reportlab") from exc

    filename = f"cluster_report_{timestamp}.pdf"
    filepath = os.path.join(self.export_path, filename)

    doc = SimpleDocTemplate(filepath, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []

    # Title
    title = Paragraph("OpenShift Resource Governance Report", styles['Title'])
    story.append(title)
    story.append(Spacer(1, 12))

    # Summary
    summary_text = f"""
<b>Resumo do Cluster:</b><br/>
Total de Pods: {report.total_pods}<br/>
Total de Namespaces: {report.total_namespaces}<br/>
Total de Nós: {report.total_nodes}<br/>
Total de Validações: {report.summary['total_validations']}<br/>
Problemas Críticos: {report.summary['critical_issues']}<br/>
"""
    story.append(Paragraph(summary_text, styles['Normal']))
    story.append(Spacer(1, 12))

    # Validations table
    if report.validations:
        data = [["Pod", "Namespace", "Container", "Tipo", "Severidade", "Mensagem"]]
        for validation in report.validations[:50]:  # Capped at 50 rows for the PDF
            data.append([
                validation.pod_name,
                validation.namespace,
                validation.container_name,
                validation.validation_type,
                validation.severity,
                # Long messages are cut to 50 characters plus an ellipsis.
                validation.message[:50] + "..." if len(validation.message) > 50 else validation.message
            ])

        table = Table(data)
        table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 14),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))

        story.append(Paragraph("<b>Validações:</b>", styles['Heading2']))
        story.append(table)

    doc.build(story)
    logger.info(f"Relatório PDF exportado: {filepath}")
    return filepath
def get_exported_reports(self) -> List[Dict[str, Any]]:
    """List the exported report files, newest first.

    Only regular files ending in ``.json``, ``.csv`` or ``.pdf`` inside
    ``self.export_path`` are listed. Directories that happen to carry one of
    those suffixes are skipped, and so are files that disappear while the
    directory is being scanned.

    Returns:
        One dict per file with ``filename``, ``filepath``, ``size`` (bytes,
        int), ``created`` (ISO 8601 text derived from ``st_ctime``) and
        ``format`` (the file extension), ordered by ``st_ctime`` descending.
    """
    found = []
    for filename in os.listdir(self.export_path):
        if not filename.endswith(('.json', '.csv', '.pdf')):
            continue
        filepath = os.path.join(self.export_path, filename)
        if not os.path.isfile(filepath):
            continue
        try:
            stat = os.stat(filepath)
        except FileNotFoundError:
            # Removed between the listing and the stat call: nothing to report.
            continue
        found.append((stat.st_ctime, {
            "filename": filename,
            "filepath": filepath,
            "size": stat.st_size,
            "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
            "format": filename.split('.')[-1]
        }))

    # Order on the numeric timestamp rather than on its text rendering.
    found.sort(key=lambda pair: pair[0], reverse=True)
    return [entry for _, entry in found]

View File

@@ -0,0 +1,345 @@
"""
Serviço de validação de recursos seguindo best practices Red Hat
"""
import logging
from typing import List, Dict, Any
from decimal import Decimal, InvalidOperation
import re
from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources
from app.core.config import settings
logger = logging.getLogger(__name__)
class ValidationService:
    """Service that validates container resource requests and limits."""

    def __init__(self):
        """Copy the validation thresholds from the application settings."""
        # Limit:request ratios used by the ratio checks.
        self.cpu_ratio, self.memory_ratio = (
            settings.cpu_limit_ratio,
            settings.memory_limit_ratio,
        )
        # Smallest requests accepted by the minimum-value checks.
        self.min_cpu_request, self.min_memory_request = (
            settings.min_cpu_request,
            settings.min_memory_request,
        )
def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]:
    """Validate every container of a pod.

    Args:
        pod: Must expose ``name``, ``namespace`` and ``containers``.

    Returns:
        The findings of all containers, concatenated in container order.
    """
    return [
        finding
        for container in pod.containers
        for finding in self._validate_container_resources(
            pod.name, pod.namespace, container
        )
    ]
def _validate_container_resources(
self,
pod_name: str,
namespace: str,
container: Dict[str, Any]
) -> List[ResourceValidation]:
"""Validar recursos de um container"""
validations = []
resources = container.get("resources", {})
requests = resources.get("requests", {})
limits = resources.get("limits", {})
# 1. Verificar se requests estão definidos
if not requests:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container["name"],
validation_type="missing_requests",
severity="error",
message="Container sem requests definidos",
recommendation="Definir requests de CPU e memória para garantir QoS"
))
# 2. Verificar se limits estão definidos
if not limits:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container["name"],
validation_type="missing_limits",
severity="warning",
message="Container sem limits definidos",
recommendation="Definir limits para evitar consumo excessivo de recursos"
))
# 3. Validar ratio limit:request
if requests and limits:
cpu_validation = self._validate_cpu_ratio(
pod_name, namespace, container["name"], requests, limits
)
if cpu_validation:
validations.append(cpu_validation)
memory_validation = self._validate_memory_ratio(
pod_name, namespace, container["name"], requests, limits
)
if memory_validation:
validations.append(memory_validation)
# 4. Validar valores mínimos
if requests:
min_validation = self._validate_minimum_values(
pod_name, namespace, container["name"], requests
)
validations.extend(min_validation)
return validations
def _validate_cpu_ratio(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str],
limits: Dict[str, str]
) -> ResourceValidation:
"""Validar ratio CPU limit:request"""
if "cpu" not in requests or "cpu" not in limits:
return None
try:
request_value = self._parse_cpu_value(requests["cpu"])
limit_value = self._parse_cpu_value(limits["cpu"])
if request_value > 0:
ratio = limit_value / request_value
if ratio > self.cpu_ratio * 1.5: # 50% de tolerância
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="warning",
message=f"Ratio CPU limit:request muito alto ({ratio:.2f}:1)",
recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.cpu_ratio}:1)"
)
elif ratio < 1.0:
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="error",
message=f"CPU limit menor que request ({ratio:.2f}:1)",
recommendation="CPU limit deve ser maior ou igual ao request"
)
except (ValueError, InvalidOperation) as e:
logger.warning(f"Erro ao validar ratio CPU: {e}")
return None
def _validate_memory_ratio(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str],
limits: Dict[str, str]
) -> ResourceValidation:
"""Validar ratio memória limit:request"""
if "memory" not in requests or "memory" not in limits:
return None
try:
request_value = self._parse_memory_value(requests["memory"])
limit_value = self._parse_memory_value(limits["memory"])
if request_value > 0:
ratio = limit_value / request_value
if ratio > self.memory_ratio * 1.5: # 50% de tolerância
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="warning",
message=f"Ratio memória limit:request muito alto ({ratio:.2f}:1)",
recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.memory_ratio}:1)"
)
elif ratio < 1.0:
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="error",
message=f"Memória limit menor que request ({ratio:.2f}:1)",
recommendation="Memória limit deve ser maior ou igual ao request"
)
except (ValueError, InvalidOperation) as e:
logger.warning(f"Erro ao validar ratio memória: {e}")
return None
def _validate_minimum_values(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str]
) -> List[ResourceValidation]:
"""Validar valores mínimos de requests"""
validations = []
# Validar CPU mínima
if "cpu" in requests:
try:
request_value = self._parse_cpu_value(requests["cpu"])
min_value = self._parse_cpu_value(self.min_cpu_request)
if request_value < min_value:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="minimum_value",
severity="warning",
message=f"CPU request muito baixo ({requests['cpu']})",
recommendation=f"Considerar aumentar para pelo menos {self.min_cpu_request}"
))
except (ValueError, InvalidOperation):
pass
# Validar memória mínima
if "memory" in requests:
try:
request_value = self._parse_memory_value(requests["memory"])
min_value = self._parse_memory_value(self.min_memory_request)
if request_value < min_value:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="minimum_value",
severity="warning",
message=f"Memória request muito baixa ({requests['memory']})",
recommendation=f"Considerar aumentar para pelo menos {self.min_memory_request}"
))
except (ValueError, InvalidOperation):
pass
return validations
def _parse_cpu_value(self, value: str) -> float:
"""Converter valor de CPU para float (cores)"""
if value.endswith('m'):
return float(value[:-1]) / 1000
elif value.endswith('n'):
return float(value[:-1]) / 1000000000
else:
return float(value)
def _parse_memory_value(self, value: str) -> int:
"""Converter valor de memória para bytes"""
value = value.upper()
if value.endswith('KI'):
return int(float(value[:-2]) * 1024)
elif value.endswith('MI'):
return int(float(value[:-2]) * 1024 * 1024)
elif value.endswith('GI'):
return int(float(value[:-2]) * 1024 * 1024 * 1024)
elif value.endswith('K'):
return int(float(value[:-1]) * 1000)
elif value.endswith('M'):
return int(float(value[:-1]) * 1000 * 1000)
elif value.endswith('G'):
return int(float(value[:-1]) * 1000 * 1000 * 1000)
else:
return int(value)
def validate_namespace_overcommit(
    self,
    namespace_resources: NamespaceResources,
    node_capacity: Dict[str, str]
) -> List[ResourceValidation]:
    """Detect namespaces whose requests exceed the given capacity.

    Args:
        namespace_resources: Must expose ``name``, ``total_cpu_requests`` and
            ``total_memory_requests``.
        node_capacity: Capacity quantities under the ``cpu`` and ``memory``
            keys; a missing key counts as "0" and disables that check.

    Returns:
        One critical finding per resource (CPU first, then memory) whose
        requests are above 100% of the capacity.
    """
    # All four quantities are parsed before any check runs.
    requested_cpu = self._parse_cpu_value(namespace_resources.total_cpu_requests)
    requested_memory = self._parse_memory_value(namespace_resources.total_memory_requests)
    capacity_cpu = self._parse_cpu_value(node_capacity.get("cpu", "0"))
    capacity_memory = self._parse_memory_value(node_capacity.get("memory", "0"))

    checks = (
        ("CPU", requested_cpu, capacity_cpu),
        ("memória", requested_memory, capacity_memory),
    )

    findings = []
    for label, requested, capacity in checks:
        if not capacity > 0:
            # Unknown or zero capacity: a percentage cannot be computed.
            continue
        utilization = (requested / capacity) * 100
        if utilization > 100:
            findings.append(ResourceValidation(
                pod_name="namespace",
                namespace=namespace_resources.name,
                container_name="all",
                validation_type="overcommit",
                severity="critical",
                message=f"Overcommit de {label} no namespace: {utilization:.1f}%",
                recommendation=f"Reduzir requests de {label} ou adicionar mais nós ao cluster"
            ))

    return findings
def generate_recommendations(self, validations: List[ResourceValidation]) -> List[str]:
    """Turn a list of findings into human-readable recommendations.

    Args:
        validations: Findings; each must expose ``validation_type``.

    Returns:
        At most one recommendation per known problem type, always in the
        order: missing requests, missing limits, invalid ratio, overcommit.
        The ratio recommendation quotes ``self.cpu_ratio``.
    """
    per_type = {}
    for finding in validations:
        kind = finding.validation_type
        per_type[kind] = per_type.get(kind, 0) + 1

    advice = []

    without_requests = per_type.get("missing_requests", 0)
    if without_requests > 0:
        advice.append(
            f"Implementar LimitRange no namespace para definir requests padrão "
            f"({without_requests} containers sem requests)"
        )

    without_limits = per_type.get("missing_limits", 0)
    if without_limits > 0:
        advice.append(
            f"Definir limits para {without_limits} containers "
            "para evitar consumo excessivo de recursos"
        )

    bad_ratio = per_type.get("invalid_ratio", 0)
    if bad_ratio > 0:
        advice.append(
            f"Ajustar ratio limit:request para {bad_ratio} containers "
            f"(recomendado: {self.cpu_ratio}:1)"
        )

    overcommitted = per_type.get("overcommit", 0)
    if overcommitted > 0:
        advice.append(
            f"Resolver overcommit em {overcommitted} namespaces "
            "para evitar problemas de performance"
        )

    return advice