Initial commit: OpenShift Resource Governance Tool

- Implements a complete resource governance tool
- Python backend with FastAPI for data collection
- Validations following Red Hat best practices
- Integration with Prometheus and VPA
- Interactive web UI for visualization
- Reports in JSON, CSV, and PDF
- Deployed as a DaemonSet with RBAC
- Automation scripts for build and deploy
2025-09-25 14:26:24 -03:00
commit 4d60c0e039
31 changed files with 3386 additions and 0 deletions

.env.example Normal file

@@ -0,0 +1,23 @@
# OpenShift/Kubernetes settings
KUBECONFIG_PATH=
CLUSTER_URL=
TOKEN=
# Prometheus settings
PROMETHEUS_URL=http://prometheus.openshift-monitoring.svc.cluster.local:9090
# Validation settings
CPU_LIMIT_RATIO=3.0
MEMORY_LIMIT_RATIO=3.0
MIN_CPU_REQUEST=10m
MIN_MEMORY_REQUEST=32Mi
# Critical namespaces for VPA (comma-separated)
CRITICAL_NAMESPACES=openshift-monitoring,openshift-ingress,openshift-apiserver,openshift-controller-manager,openshift-sdn
# Report settings
REPORT_EXPORT_PATH=/tmp/reports
# Security settings
ENABLE_RBAC=true
SERVICE_ACCOUNT_NAME=resource-governance-sa

.gitignore vendored Normal file

@@ -0,0 +1,161 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# IDE
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
.DS_Store?
._*
.Spotlight-V100
.Trashes
ehthumbs.db
Thumbs.db
# Project specific
reports/
*.json
*.csv
*.pdf
logs/
temp/
tmp/
# Kubernetes
kubeconfig
*.kubeconfig
# Docker
.dockerignore

Dockerfile Normal file

@@ -0,0 +1,58 @@
# Multi-stage build to keep the final image small
FROM python:3.11-slim as builder
# Install system dependencies required for compilation
RUN apt-get update && apt-get install -y \
    gcc \
    g++ \
    && rm -rf /var/lib/apt/lists/*
# Create working directory
WORKDIR /app
# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir --user -r requirements.txt
# Final stage - production image
FROM python:3.11-slim
# Install runtime dependencies
RUN apt-get update && apt-get install -y \
    curl \
    && rm -rf /var/lib/apt/lists/*
# Create non-root user
RUN groupadd -r appuser && useradd -r -g appuser appuser
# Create required directories
RUN mkdir -p /app /tmp/reports && \
    chown -R appuser:appuser /app /tmp/reports
# Copy Python dependencies from the builder stage
COPY --from=builder /root/.local /home/appuser/.local
# Add the local dependencies to PATH
ENV PATH=/home/appuser/.local/bin:$PATH
# Set working directory
WORKDIR /app
# Copy application code
COPY app/ ./app/
# Change file ownership
RUN chown -R appuser:appuser /app
# Switch to the non-root user
USER appuser
# Expose port
EXPOSE 8080
# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8080/health || exit 1
# Run the application
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]

Makefile Normal file

@@ -0,0 +1,139 @@
# Makefile for the OpenShift Resource Governance Tool
# Settings
IMAGE_NAME = resource-governance
TAG = latest
REGISTRY = quay.io/openshift
FULL_IMAGE_NAME = $(REGISTRY)/$(IMAGE_NAME):$(TAG)
NAMESPACE = resource-governance
# Colors for output
RED = \033[0;31m
GREEN = \033[0;32m
YELLOW = \033[1;33m
BLUE = \033[0;34m
NC = \033[0m # No Color
.PHONY: help build test deploy undeploy clean dev logs status
help: ## Show help
@echo "$(BLUE)OpenShift Resource Governance Tool$(NC)"
@echo ""
@echo "Comandos disponíveis:"
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " $(GREEN)%-15s$(NC) %s\n", $$1, $$2}'
build: ## Build the Docker image
@echo "$(YELLOW)📦 Building Docker image...$(NC)"
@./scripts/build.sh $(TAG) $(REGISTRY)
test: ## Test the application
@echo "$(YELLOW)🧪 Testing application...$(NC)"
@python -c "import app.main; print('$(GREEN)✅ App imports successfully$(NC)')"
@echo "$(YELLOW)🧪 Testing API...$(NC)"
@python -m uvicorn app.main:app --host 0.0.0.0 --port 8080 &
@sleep 5
@curl -f http://localhost:8080/health || (echo "$(RED)❌ Health check failed$(NC)" && exit 1)
@pkill -f uvicorn
@echo "$(GREEN)✅ Tests passed$(NC)"
deploy: ## Deploy to OpenShift
@echo "$(YELLOW)🚀 Deploying to OpenShift...$(NC)"
@./scripts/deploy.sh $(TAG) $(REGISTRY)
undeploy: ## Remove from OpenShift
@echo "$(YELLOW)🗑️ Undeploying from OpenShift...$(NC)"
@./scripts/undeploy.sh
clean: ## Clean up local resources
@echo "$(YELLOW)🧹 Cleaning up...$(NC)"
@docker rmi $(FULL_IMAGE_NAME) 2>/dev/null || true
@docker system prune -f
@echo "$(GREEN)✅ Cleanup completed$(NC)"
dev: ## Run in development mode
@echo "$(YELLOW)🔧 Starting development server...$(NC)"
@python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8080
logs: ## Show application logs
@echo "$(YELLOW)📋 Showing application logs...$(NC)"
@oc logs -f daemonset/$(IMAGE_NAME) -n $(NAMESPACE)
status: ## Show application status
@echo "$(YELLOW)📊 Application status:$(NC)"
@oc get all -n $(NAMESPACE)
@echo ""
@echo "$(YELLOW)🌐 Route URL:$(NC)"
@oc get route $(IMAGE_NAME)-route -n $(NAMESPACE) -o jsonpath='{.spec.host}' 2>/dev/null || echo "Route not found"
install-deps: ## Install Python dependencies
@echo "$(YELLOW)📦 Installing Python dependencies...$(NC)"
@pip install -r requirements.txt
@echo "$(GREEN)✅ Dependencies installed$(NC)"
format: ## Format Python code
@echo "$(YELLOW)🎨 Formatting Python code...$(NC)"
@python -m black app/
@python -m isort app/
@echo "$(GREEN)✅ Code formatted$(NC)"
lint: ## Lint Python code
@echo "$(YELLOW)🔍 Linting Python code...$(NC)"
@python -m flake8 app/
@python -m mypy app/
@echo "$(GREEN)✅ Linting completed$(NC)"
security: ## Run the security check
@echo "$(YELLOW)🔒 Security check...$(NC)"
@python -m bandit -r app/
@echo "$(GREEN)✅ Security check completed$(NC)"
all: clean install-deps format lint test build ## Run the full pipeline
# OpenShift-specific commands
oc-login: ## Log in to OpenShift
@echo "$(YELLOW)🔐 Logging into OpenShift...$(NC)"
@oc login
oc-projects: ## List OpenShift projects
@echo "$(YELLOW)📋 OpenShift projects:$(NC)"
@oc get projects
oc-ns: ## Create namespace
@echo "$(YELLOW)📁 Creating namespace...$(NC)"
@oc apply -f k8s/namespace.yaml
oc-rbac: ## Apply RBAC
@echo "$(YELLOW)🔐 Applying RBAC...$(NC)"
@oc apply -f k8s/rbac.yaml
oc-config: ## Apply ConfigMap
@echo "$(YELLOW)⚙️ Applying ConfigMap...$(NC)"
@oc apply -f k8s/configmap.yaml
oc-deploy: ## Apply DaemonSet
@echo "$(YELLOW)📦 Applying DaemonSet...$(NC)"
@oc apply -f k8s/daemonset.yaml
oc-service: ## Apply Service
@echo "$(YELLOW)🌐 Applying Service...$(NC)"
@oc apply -f k8s/service.yaml
oc-route: ## Apply Route
@echo "$(YELLOW)🛣️ Applying Route...$(NC)"
@oc apply -f k8s/route.yaml
oc-apply: oc-ns oc-rbac oc-config oc-deploy oc-service oc-route ## Apply all resources
# Monitoring commands
monitor: ## Monitor the application
@echo "$(YELLOW)📊 Monitoring application...$(NC)"
@watch -n 5 'oc get pods -n $(NAMESPACE) && echo "" && oc get route $(IMAGE_NAME)-route -n $(NAMESPACE)'
health: ## Check application health
@echo "$(YELLOW)🏥 Health check...$(NC)"
@ROUTE_URL=$$(oc get route $(IMAGE_NAME)-route -n $(NAMESPACE) -o jsonpath='{.spec.host}' 2>/dev/null); \
if [ -n "$$ROUTE_URL" ]; then \
curl -f https://$$ROUTE_URL/health || echo "$(RED)❌ Health check failed$(NC)"; \
else \
echo "$(RED)❌ Route not found$(NC)"; \
fi

README.md Normal file

@@ -0,0 +1,301 @@
# OpenShift Resource Governance Tool
A resource governance tool for OpenShift clusters that goes beyond what the Metrics Server and VPA offer, providing consolidated validations, reports, and recommendations.
## 🚀 Features
- **Automatic Collection**: Collects requests/limits from every pod/container in the cluster
- **Red Hat Validations**: Checks capacity-management best practices
- **VPA Integration**: Consumes VPA recommendations in Off (recommendation-only) mode
- **Prometheus Integration**: Collects real consumption metrics
- **Consolidated Reports**: Generates reports in JSON, CSV, and PDF
- **Web UI**: Simple interface for visualization and interaction
- **Recommendation Application**: Lets you approve and apply recommendations
## 📋 Requirements
- OpenShift 4.x
- Prometheus (built into OCP)
- VPA (optional, for recommendations)
- Python 3.11+
- Docker
- OpenShift CLI (oc)
## 🛠️ Installation
### 1. Build the Image
```bash
# Local build
./scripts/build.sh
# Build with a specific tag
./scripts/build.sh v1.0.0
# Build for a specific registry
./scripts/build.sh latest quay.io/your-user
```
### 2. Deploy to OpenShift
```bash
# Default deploy
./scripts/deploy.sh
# Deploy with a specific tag
./scripts/deploy.sh v1.0.0
# Deploy to a specific registry
./scripts/deploy.sh latest quay.io/your-user
```
### 3. Access the Application
After the deploy, access the application through the created route:
```bash
# Get the route URL
oc get route resource-governance-route -n resource-governance
# Access via browser
# https://resource-governance-route-resource-governance.apps.openshift.local
```
## 🔧 Configuration
### ConfigMap
The application is configured through the `resource-governance-config` ConfigMap:
```yaml
data:
  CPU_LIMIT_RATIO: "3.0"      # Default limit:request ratio for CPU
  MEMORY_LIMIT_RATIO: "3.0"   # Default limit:request ratio for memory
  MIN_CPU_REQUEST: "10m"      # Minimum CPU request
  MIN_MEMORY_REQUEST: "32Mi"  # Minimum memory request
  CRITICAL_NAMESPACES: |      # Critical namespaces for VPA
    openshift-monitoring
    openshift-ingress
    openshift-apiserver
  PROMETHEUS_URL: "http://prometheus.openshift-monitoring.svc.cluster.local:9090"
```
### Environment Variables
- `KUBECONFIG_PATH`: Path to a kubeconfig (used in development)
- `PROMETHEUS_URL`: Prometheus URL
- `CPU_LIMIT_RATIO`: CPU limit:request ratio
- `MEMORY_LIMIT_RATIO`: Memory limit:request ratio
- `MIN_CPU_REQUEST`: Minimum CPU request
- `MIN_MEMORY_REQUEST`: Minimum memory request
## 📊 Usage
### API Endpoints
#### Cluster Status
```bash
GET /api/v1/cluster/status
```
#### Namespace Status
```bash
GET /api/v1/namespace/{namespace}/status
```
#### Validations
```bash
GET /api/v1/validations?namespace=default&severity=error
```
#### VPA Recommendations
```bash
GET /api/v1/vpa/recommendations?namespace=default
```
#### Export Report
```bash
POST /api/v1/export
Content-Type: application/json

{
  "format": "json",
  "namespaces": ["default", "kube-system"],
  "include_vpa": true,
  "include_validations": true
}
```
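From Python, the same export call can be made with `requests` (an illustrative sketch; the host below is the hypothetical route URL from the installation section, and `verify=False` is only for self-signed router certificates):

```python
# Illustrative client for POST /api/v1/export (adjust BASE to your route URL).
import requests

BASE = "https://resource-governance-route-resource-governance.apps.openshift.local"

payload = {
    "format": "json",
    "namespaces": ["default"],
    "include_vpa": True,
    "include_validations": True,
}
resp = requests.post(f"{BASE}/api/v1/export", json=payload, verify=False, timeout=120)
resp.raise_for_status()
print(resp.json())  # {"message": ..., "filepath": ..., "format": "json"}
```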
### Usage Examples
#### 1. Check Cluster Status
```bash
curl https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/cluster/status
```
#### 2. Export a CSV Report
```bash
curl -X POST https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/export \
  -H "Content-Type: application/json" \
  -d '{"format": "csv", "include_vpa": true}'
```
#### 3. View Critical Validations
```bash
curl "https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/validations?severity=critical"
```
## 🔍 Implemented Validations
### 1. Required Requests
- **Problem**: Pods without requests defined
- **Severity**: Error
- **Recommendation**: Define CPU and memory requests
### 2. Recommended Limits
- **Problem**: Pods without limits defined
- **Severity**: Warning
- **Recommendation**: Define limits to avoid excessive consumption
### 3. Limit:Request Ratio
- **Problem**: Ratio too high or too low
- **Severity**: Warning/Error
- **Recommendation**: Adjust toward a 3:1 ratio (see the worked example at the end of this section)
### 4. Minimum Values
- **Problem**: Requests too low
- **Severity**: Warning
- **Recommendation**: Raise them to the minimum values
### 5. Overcommit
- **Problem**: Requests exceed cluster capacity
- **Severity**: Critical
- **Recommendation**: Reduce requests or add nodes
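To make the ratio rule concrete, here is a small worked example (an illustrative sketch; the real parsing and validation logic lives in `app/services/validation_service.py`):

```python
# Illustrative sketch of the 3:1 CPU limit:request check described above.
def cpu_to_cores(value: str) -> float:
    """Convert Kubernetes CPU strings ("500m", "2") to cores."""
    return float(value[:-1]) / 1000 if value.endswith("m") else float(value)

request, limit = "100m", "500m"
ratio = cpu_to_cores(limit) / cpu_to_cores(request)
print(f"{ratio:.1f}:1")  # 5.0:1 -> above the 3:1 target, reported as a warning
```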
## 📈 Reports
### JSON Format
```json
{
"timestamp": "2024-01-15T10:30:00Z",
"total_pods": 150,
"total_namespaces": 25,
"total_nodes": 3,
"validations": [...],
"vpa_recommendations": [...],
"summary": {
"total_validations": 45,
"critical_issues": 5,
"warnings": 25,
"errors": 15
}
}
```
### CSV Format
```csv
Pod Name,Namespace,Container Name,Validation Type,Severity,Message,Recommendation
pod-1,default,nginx,missing_requests,error,Container sem requests definidos,Definir requests de CPU e memória
```
## 🔐 Security
### RBAC
The application uses a dedicated ServiceAccount with minimal permissions:
- **Pods**: get, list, watch, patch, update
- **Namespaces**: get, list, watch
- **Nodes**: get, list, watch
- **VPA**: get, list, watch
- **Deployments/ReplicaSets**: get, list, watch, patch, update
### Security Context
- Runs as a non-root user (UID 1000)
- Uses a SecurityContext with runAsNonRoot: true
- Constrains resources with requests/limits
## 🐛 Troubleshooting
### Check Logs
```bash
oc logs -f daemonset/resource-governance -n resource-governance
```
### Check Pod Status
```bash
oc get pods -n resource-governance
oc describe pod <pod-name> -n resource-governance
```
### Check RBAC
```bash
oc auth can-i get pods --as=system:serviceaccount:resource-governance:resource-governance-sa
```
### Test Connectivity
```bash
# Health check
curl https://resource-governance-route-resource-governance.apps.openshift.local/health
# API test
curl https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/cluster/status
```
## 🚀 Development
### Run Locally
```bash
# Install dependencies
pip install -r requirements.txt
# Run the application
python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8080
```
### Run with Docker
```bash
# Build
docker build -t resource-governance .
# Run
docker run -p 8080:8080 resource-governance
```
### Tests
```bash
# Test imports
python -c "import app.main; print('OK')"
# Test the API
curl http://localhost:8080/health
```
## 📝 Roadmap
### Upcoming Versions
- [ ] Web UI with interactive charts
- [ ] PDF reports with charts
- [ ] Custom rules per namespace
- [ ] GitOps integration (ArgoCD)
- [ ] Notifications via Slack/Teams
- [ ] Custom Prometheus metrics
- [ ] Multi-cluster support
## 🤝 Contributing
1. Fork the project
2. Create a feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## 📄 License
This project is licensed under the MIT license. See the [LICENSE](LICENSE) file for details.
## 📞 Support
For support and questions:
- Open an issue on GitHub
- Consult the OpenShift documentation
- Check the application logs

app/__init__.py Normal file

@@ -0,0 +1 @@
# OpenShift Resource Governance Tool

app/api/__init__.py Normal file

@@ -0,0 +1 @@
# API routes

app/api/routes.py Normal file

@@ -0,0 +1,292 @@
"""
API routes
"""
import logging
from typing import List, Optional
from fastapi import APIRouter, HTTPException, Depends, Request
from fastapi.responses import FileResponse
from app.models.resource_models import (
ClusterReport, NamespaceReport, ExportRequest,
ApplyRecommendationRequest
)
from app.services.validation_service import ValidationService
from app.services.report_service import ReportService
logger = logging.getLogger(__name__)
# Criar router
api_router = APIRouter()
# Inicializar serviços
validation_service = ValidationService()
report_service = ReportService()
def get_k8s_client(request: Request):
"""Dependency para obter cliente Kubernetes"""
return request.app.state.k8s_client
def get_prometheus_client(request: Request):
"""Dependency para obter cliente Prometheus"""
return request.app.state.prometheus_client
@api_router.get("/cluster/status")
async def get_cluster_status(
k8s_client=Depends(get_k8s_client),
prometheus_client=Depends(get_prometheus_client)
):
"""Obter status geral do cluster"""
try:
# Coletar dados básicos
pods = await k8s_client.get_all_pods()
nodes_info = await k8s_client.get_nodes_info()
# Validar recursos
all_validations = []
for pod in pods:
pod_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(pod_validations)
# Obter informações de overcommit
overcommit_info = await prometheus_client.get_cluster_overcommit()
# Obter recomendações VPA
vpa_recommendations = await k8s_client.get_vpa_recommendations()
# Gerar relatório
report = report_service.generate_cluster_report(
pods=pods,
validations=all_validations,
vpa_recommendations=vpa_recommendations,
overcommit_info=overcommit_info,
nodes_info=nodes_info
)
return report
except Exception as e:
logger.error(f"Erro ao obter status do cluster: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/namespace/{namespace}/status")
async def get_namespace_status(
namespace: str,
k8s_client=Depends(get_k8s_client),
prometheus_client=Depends(get_prometheus_client)
):
"""Obter status de um namespace específico"""
try:
# Coletar dados do namespace
namespace_resources = await k8s_client.get_namespace_resources(namespace)
# Validar recursos
all_validations = []
for pod in namespace_resources.pods:
pod_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(pod_validations)
# Obter uso de recursos do Prometheus
resource_usage = await prometheus_client.get_namespace_resource_usage(namespace)
# Gerar relatório do namespace
report = report_service.generate_namespace_report(
namespace=namespace,
pods=namespace_resources.pods,
validations=all_validations,
resource_usage=resource_usage
)
return report
except Exception as e:
logger.error(f"Erro ao obter status do namespace {namespace}: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/pods")
async def get_pods(
namespace: Optional[str] = None,
k8s_client=Depends(get_k8s_client)
):
"""Listar pods com informações de recursos"""
try:
if namespace:
namespace_resources = await k8s_client.get_namespace_resources(namespace)
return namespace_resources.pods
else:
return await k8s_client.get_all_pods()
except Exception as e:
logger.error(f"Erro ao listar pods: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/validations")
async def get_validations(
namespace: Optional[str] = None,
severity: Optional[str] = None,
k8s_client=Depends(get_k8s_client)
):
"""Listar validações de recursos"""
try:
# Coletar pods
if namespace:
namespace_resources = await k8s_client.get_namespace_resources(namespace)
pods = namespace_resources.pods
else:
pods = await k8s_client.get_all_pods()
# Validar recursos
all_validations = []
for pod in pods:
pod_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(pod_validations)
# Filtrar por severidade se especificado
if severity:
all_validations = [
v for v in all_validations if v.severity == severity
]
return all_validations
except Exception as e:
logger.error(f"Erro ao obter validações: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/vpa/recommendations")
async def get_vpa_recommendations(
namespace: Optional[str] = None,
k8s_client=Depends(get_k8s_client)
):
"""Obter recomendações do VPA"""
try:
recommendations = await k8s_client.get_vpa_recommendations()
if namespace:
recommendations = [
r for r in recommendations if r.namespace == namespace
]
return recommendations
except Exception as e:
logger.error(f"Erro ao obter recomendações VPA: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.post("/export")
async def export_report(
export_request: ExportRequest,
k8s_client=Depends(get_k8s_client),
prometheus_client=Depends(get_prometheus_client)
):
"""Exportar relatório em diferentes formatos"""
try:
# Gerar relatório
pods = await k8s_client.get_all_pods()
nodes_info = await k8s_client.get_nodes_info()
# Filtrar por namespaces se especificado
if export_request.namespaces:
pods = [p for p in pods if p.namespace in export_request.namespaces]
# Validar recursos
all_validations = []
for pod in pods:
pod_validations = validation_service.validate_pod_resources(pod)
all_validations.extend(pod_validations)
# Obter informações adicionais
overcommit_info = {}
vpa_recommendations = []
if export_request.include_vpa:
vpa_recommendations = await k8s_client.get_vpa_recommendations()
if export_request.include_validations:
overcommit_info = await prometheus_client.get_cluster_overcommit()
# Gerar relatório
report = report_service.generate_cluster_report(
pods=pods,
validations=all_validations,
vpa_recommendations=vpa_recommendations,
overcommit_info=overcommit_info,
nodes_info=nodes_info
)
# Exportar
filepath = await report_service.export_report(report, export_request)
return {
"message": "Relatório exportado com sucesso",
"filepath": filepath,
"format": export_request.format
}
except Exception as e:
logger.error(f"Erro ao exportar relatório: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/export/files")
async def list_exported_files():
"""Listar arquivos exportados"""
try:
files = report_service.get_exported_reports()
return files
except Exception as e:
logger.error(f"Erro ao listar arquivos exportados: {e}")
raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/export/files/{filename}")
async def download_exported_file(filename: str):
"""Download de arquivo exportado"""
try:
files = report_service.get_exported_reports()
file_info = next((f for f in files if f["filename"] == filename), None)
if not file_info:
raise HTTPException(status_code=404, detail="Arquivo não encontrado")
return FileResponse(
path=file_info["filepath"],
filename=filename,
media_type='application/octet-stream'
)
    except HTTPException:
        # Re-raise the 404 above instead of converting it into a 500
        raise
    except Exception as e:
        logger.error(f"Erro ao baixar arquivo {filename}: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@api_router.post("/apply/recommendation")
async def apply_recommendation(
recommendation: ApplyRecommendationRequest,
k8s_client=Depends(get_k8s_client)
):
"""Aplicar recomendação de recursos"""
try:
# TODO: Implementar aplicação de recomendações
# Por enquanto, apenas simular
if recommendation.dry_run:
return {
"message": "Dry run - recomendação seria aplicada",
"pod": recommendation.pod_name,
"namespace": recommendation.namespace,
"container": recommendation.container_name,
"action": f"{recommendation.action} {recommendation.resource_type} = {recommendation.value}"
}
else:
# Implementar aplicação real da recomendação
raise HTTPException(status_code=501, detail="Aplicação de recomendações não implementada ainda")
    except HTTPException:
        # Re-raise the 501 above instead of converting it into a 500
        raise
    except Exception as e:
        logger.error(f"Erro ao aplicar recomendação: {e}")
        raise HTTPException(status_code=500, detail=str(e))
@api_router.get("/health")
async def health_check():
"""Health check da API"""
return {
"status": "healthy",
"service": "resource-governance-api",
"version": "1.0.0"
}

app/core/__init__.py Normal file

@@ -0,0 +1 @@
# Core modules

app/core/config.py Normal file

@@ -0,0 +1,45 @@
"""
Application settings
"""
import os
from typing import List, Optional
from pydantic import BaseSettings
class Settings(BaseSettings):
"""Configurações da aplicação"""
# Configurações do OpenShift/Kubernetes
kubeconfig_path: Optional[str] = None
cluster_url: Optional[str] = None
token: Optional[str] = None
# Configurações do Prometheus
prometheus_url: str = "http://prometheus.openshift-monitoring.svc.cluster.local:9090"
# Configurações de validação
cpu_limit_ratio: float = 3.0 # Ratio padrão limit:request para CPU
memory_limit_ratio: float = 3.0 # Ratio padrão limit:request para memória
min_cpu_request: str = "10m" # Mínimo de CPU request
min_memory_request: str = "32Mi" # Mínimo de memória request
# Namespaces críticos para VPA
critical_namespaces: List[str] = [
"openshift-monitoring",
"openshift-ingress",
"openshift-apiserver",
"openshift-controller-manager",
"openshift-sdn"
]
# Configurações de relatório
report_export_path: str = "/tmp/reports"
# Configurações de segurança
enable_rbac: bool = True
service_account_name: str = "resource-governance-sa"
class Config:
env_file = ".env"
case_sensitive = False
settings = Settings()
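A minimal usage sketch (assuming pydantic v1's `BaseSettings`, as imported above, and no conflicting `.env` file): environment variables override these defaults, and list-valued fields such as `critical_namespaces` are parsed from the environment as JSON, so the comma-separated form in `.env.example` would need to be adapted or handled by a custom validator.

```python
# Minimal sketch: overriding defaults via environment variables.
# Assumes the repository's app package is importable and pydantic v1 is installed.
import os

os.environ["CPU_LIMIT_RATIO"] = "2.5"
os.environ["CRITICAL_NAMESPACES"] = '["openshift-monitoring", "openshift-ingress"]'

from app.core.config import Settings

s = Settings()
print(s.cpu_limit_ratio)      # 2.5
print(s.critical_namespaces)  # ['openshift-monitoring', 'openshift-ingress']
```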

app/core/kubernetes_client.py Normal file

@@ -0,0 +1,234 @@
"""
Kubernetes/OpenShift client for data collection
"""
import logging
from typing import List, Dict, Any, Optional
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import asyncio
import aiohttp
from app.core.config import settings
from app.models.resource_models import PodResource, NamespaceResources, VPARecommendation
logger = logging.getLogger(__name__)
class K8sClient:
"""Cliente para interação com Kubernetes/OpenShift"""
def __init__(self):
        self.v1 = None
        self.autoscaling_v1 = None
        self.apps_v1 = None
        self.custom_objects = None
        self.initialized = False
async def initialize(self):
"""Inicializar cliente Kubernetes"""
try:
# Tentar carregar configuração do cluster
if settings.kubeconfig_path:
config.load_kube_config(config_file=settings.kubeconfig_path)
else:
# Usar configuração in-cluster
config.load_incluster_config()
# Inicializar clientes da API
            self.v1 = client.CoreV1Api()
            self.autoscaling_v1 = client.AutoscalingV1Api()
            self.apps_v1 = client.AppsV1Api()
            self.custom_objects = client.CustomObjectsApi()  # used to read VPA custom resources
self.initialized = True
logger.info("Cliente Kubernetes inicializado com sucesso")
except Exception as e:
logger.error(f"Erro ao inicializar cliente Kubernetes: {e}")
raise
async def get_all_pods(self) -> List[PodResource]:
"""Coletar informações de todos os pods do cluster"""
if not self.initialized:
raise RuntimeError("Cliente Kubernetes não inicializado")
pods_data = []
try:
# Listar todos os pods em todos os namespaces
pods = self.v1.list_pod_for_all_namespaces(watch=False)
for pod in pods.items:
pod_resource = PodResource(
name=pod.metadata.name,
namespace=pod.metadata.namespace,
node_name=pod.spec.node_name,
phase=pod.status.phase,
containers=[]
)
# Processar containers do pod
for container in pod.spec.containers:
container_resource = {
"name": container.name,
"image": container.image,
"resources": {
"requests": {},
"limits": {}
}
}
# Extrair requests e limits
if container.resources:
if container.resources.requests:
container_resource["resources"]["requests"] = {
k: v for k, v in container.resources.requests.items()
}
if container.resources.limits:
container_resource["resources"]["limits"] = {
k: v for k, v in container.resources.limits.items()
}
pod_resource.containers.append(container_resource)
pods_data.append(pod_resource)
logger.info(f"Coletados {len(pods_data)} pods")
return pods_data
except ApiException as e:
logger.error(f"Erro ao listar pods: {e}")
raise
async def get_namespace_resources(self, namespace: str) -> NamespaceResources:
"""Coletar recursos de um namespace específico"""
if not self.initialized:
raise RuntimeError("Cliente Kubernetes não inicializado")
try:
# Listar pods do namespace
pods = self.v1.list_namespaced_pod(namespace=namespace)
namespace_resource = NamespaceResources(
name=namespace,
pods=[],
total_cpu_requests="0",
total_cpu_limits="0",
total_memory_requests="0",
total_memory_limits="0"
)
for pod in pods.items:
pod_resource = PodResource(
name=pod.metadata.name,
namespace=pod.metadata.namespace,
node_name=pod.spec.node_name,
phase=pod.status.phase,
containers=[]
)
for container in pod.spec.containers:
container_resource = {
"name": container.name,
"image": container.image,
"resources": {
"requests": {},
"limits": {}
}
}
if container.resources:
if container.resources.requests:
container_resource["resources"]["requests"] = {
k: v for k, v in container.resources.requests.items()
}
if container.resources.limits:
container_resource["resources"]["limits"] = {
k: v for k, v in container.resources.limits.items()
}
pod_resource.containers.append(container_resource)
namespace_resource.pods.append(pod_resource)
return namespace_resource
except ApiException as e:
logger.error(f"Erro ao coletar recursos do namespace {namespace}: {e}")
raise
async def get_vpa_recommendations(self) -> List[VPARecommendation]:
"""Coletar recomendações do VPA"""
if not self.initialized:
raise RuntimeError("Cliente Kubernetes não inicializado")
recommendations = []
try:
            # VPAs are custom resources (autoscaling.k8s.io), so read them via the CustomObjectsApi
            vpa_list = self.custom_objects.list_cluster_custom_object(
                group="autoscaling.k8s.io",
                version="v1",
                plural="verticalpodautoscalers"
            )
            for vpa in vpa_list.get("items", []):
                status = vpa.get("status", {})
                if status.get("recommendation"):
                    recommendation = VPARecommendation(
                        name=vpa["metadata"]["name"],
                        namespace=vpa["metadata"]["namespace"],
                        target_ref=vpa["spec"].get("targetRef", {}),
                        recommendations=status["recommendation"]
                    )
                    recommendations.append(recommendation)
logger.info(f"Coletadas {len(recommendations)} recomendações VPA")
return recommendations
except ApiException as e:
logger.error(f"Erro ao coletar recomendações VPA: {e}")
# VPA pode não estar instalado, retornar lista vazia
return []
async def get_nodes_info(self) -> List[Dict[str, Any]]:
"""Coletar informações dos nós do cluster"""
if not self.initialized:
raise RuntimeError("Cliente Kubernetes não inicializado")
try:
nodes = self.v1.list_node()
nodes_info = []
for node in nodes.items:
node_info = {
"name": node.metadata.name,
"labels": node.metadata.labels or {},
"capacity": {},
"allocatable": {},
"conditions": []
}
# Capacidade do nó
if node.status.capacity:
node_info["capacity"] = {
k: v for k, v in node.status.capacity.items()
}
# Recursos alocáveis
if node.status.allocatable:
node_info["allocatable"] = {
k: v for k, v in node.status.allocatable.items()
}
# Condições do nó
if node.status.conditions:
node_info["conditions"] = [
{
"type": condition.type,
"status": condition.status,
"reason": condition.reason,
"message": condition.message
}
for condition in node.status.conditions
]
nodes_info.append(node_info)
return nodes_info
except ApiException as e:
logger.error(f"Erro ao coletar informações dos nós: {e}")
raise
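A minimal usage sketch for this client (assumes a reachable cluster via `KUBECONFIG_PATH` or in-cluster config, with the RBAC permissions described in the README; the `asyncio` wrapper is only for illustration):

```python
# Minimal sketch: initialize the client and print a basic cluster inventory.
import asyncio

from app.core.kubernetes_client import K8sClient


async def main() -> None:
    k8s = K8sClient()
    await k8s.initialize()

    pods = await k8s.get_all_pods()
    nodes = await k8s.get_nodes_info()
    vpas = await k8s.get_vpa_recommendations()
    print(f"pods={len(pods)} nodes={len(nodes)} vpa_recommendations={len(vpas)}")


asyncio.run(main())
```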

app/core/prometheus_client.py Normal file

@@ -0,0 +1,131 @@
"""
Prometheus client for metrics collection
"""
import logging
import aiohttp
import asyncio
from typing import Dict, List, Any, Optional
from datetime import datetime, timedelta
from app.core.config import settings
logger = logging.getLogger(__name__)
class PrometheusClient:
"""Cliente para interação com Prometheus"""
def __init__(self):
self.base_url = settings.prometheus_url
self.session = None
self.initialized = False
async def initialize(self):
"""Inicializar cliente Prometheus"""
try:
self.session = aiohttp.ClientSession()
# Testar conexão
async with self.session.get(f"{self.base_url}/api/v1/query?query=up") as response:
if response.status == 200:
self.initialized = True
logger.info("Cliente Prometheus inicializado com sucesso")
else:
logger.warning(f"Prometheus retornou status {response.status}")
except Exception as e:
logger.error(f"Erro ao inicializar cliente Prometheus: {e}")
# Prometheus pode não estar disponível, continuar sem ele
self.initialized = False
async def query(self, query: str, time: Optional[datetime] = None) -> Dict[str, Any]:
"""Executar query no Prometheus"""
if not self.initialized or not self.session:
return {"status": "error", "message": "Prometheus não disponível"}
try:
params = {"query": query}
if time:
params["time"] = int(time.timestamp())
async with self.session.get(
f"{self.base_url}/api/v1/query",
params=params
) as response:
if response.status == 200:
data = await response.json()
return data
else:
logger.error(f"Erro na query Prometheus: {response.status}")
return {"status": "error", "message": f"HTTP {response.status}"}
except Exception as e:
logger.error(f"Erro ao executar query Prometheus: {e}")
return {"status": "error", "message": str(e)}
async def get_pod_cpu_usage(self, namespace: str, pod_name: str) -> Dict[str, Any]:
"""Obter uso de CPU de um pod específico"""
query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{pod_name}"}}[5m])'
return await self.query(query)
async def get_pod_memory_usage(self, namespace: str, pod_name: str) -> Dict[str, Any]:
"""Obter uso de memória de um pod específico"""
query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod="{pod_name}"}}'
return await self.query(query)
async def get_namespace_resource_usage(self, namespace: str) -> Dict[str, Any]:
"""Obter uso de recursos de um namespace"""
cpu_query = f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m]))'
memory_query = f'sum(container_memory_working_set_bytes{{namespace="{namespace}"}})'
cpu_result = await self.query(cpu_query)
memory_result = await self.query(memory_query)
return {
"cpu": cpu_result,
"memory": memory_result
}
async def get_cluster_overcommit(self) -> Dict[str, Any]:
"""Verificar overcommit no cluster"""
# CPU overcommit
cpu_capacity_query = 'sum(kube_node_status_capacity{resource="cpu"})'
cpu_requests_query = 'sum(kube_pod_container_resource_requests{resource="cpu"})'
# Memory overcommit
memory_capacity_query = 'sum(kube_node_status_capacity{resource="memory"})'
memory_requests_query = 'sum(kube_pod_container_resource_requests{resource="memory"})'
cpu_capacity = await self.query(cpu_capacity_query)
cpu_requests = await self.query(cpu_requests_query)
memory_capacity = await self.query(memory_capacity_query)
memory_requests = await self.query(memory_requests_query)
return {
"cpu": {
"capacity": cpu_capacity,
"requests": cpu_requests
},
"memory": {
"capacity": memory_capacity,
"requests": memory_requests
}
}
async def get_node_resource_usage(self) -> List[Dict[str, Any]]:
"""Obter uso de recursos por nó"""
query = '''
(
kube_node_status_capacity{resource="cpu"} or
kube_node_status_capacity{resource="memory"} or
kube_pod_container_resource_requests{resource="cpu"} or
kube_pod_container_resource_requests{resource="memory"}
)
'''
result = await self.query(query)
return result
async def close(self):
"""Fechar sessão HTTP"""
if self.session:
await self.session.close()
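A short usage sketch (assumes `PROMETHEUS_URL` is reachable, e.g. in-cluster or via `oc port-forward` during development; when it is not, `query()` returns an error dict instead of raising):

```python
# Minimal sketch: run the same PromQL queries the services use.
import asyncio

from app.core.prometheus_client import PrometheusClient


async def main() -> None:
    prom = PrometheusClient()
    await prom.initialize()

    usage = await prom.get_namespace_resource_usage("openshift-monitoring")
    overcommit = await prom.get_cluster_overcommit()
    print(usage["cpu"].get("status"), usage["memory"].get("status"))
    print(list(overcommit.keys()))  # ['cpu', 'memory']

    await prom.close()


asyncio.run(main())
```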

app/main.py Normal file

@@ -0,0 +1,81 @@
"""
OpenShift Resource Governance Tool
Application for resource governance in OpenShift clusters
"""
import os
import logging
from fastapi import FastAPI, HTTPException, Depends
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse
from contextlib import asynccontextmanager
from app.core.config import settings
from app.api.routes import api_router
from app.core.kubernetes_client import K8sClient
from app.core.prometheus_client import PrometheusClient
# Configuração de logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Inicialização e cleanup da aplicação"""
logger.info("Iniciando OpenShift Resource Governance Tool")
# Inicializar clientes
app.state.k8s_client = K8sClient()
app.state.prometheus_client = PrometheusClient()
try:
await app.state.k8s_client.initialize()
await app.state.prometheus_client.initialize()
logger.info("Clientes inicializados com sucesso")
except Exception as e:
logger.error(f"Erro ao inicializar clientes: {e}")
raise
yield
logger.info("Finalizando aplicação")
# Criar aplicação FastAPI
app = FastAPI(
title="OpenShift Resource Governance Tool",
description="Ferramenta de governança de recursos para clusters OpenShift",
version="1.0.0",
lifespan=lifespan
)
# Incluir rotas da API
app.include_router(api_router, prefix="/api/v1")
# Servir arquivos estáticos
app.mount("/static", StaticFiles(directory="app/static"), name="static")
@app.get("/", response_class=HTMLResponse)
async def root():
"""Página principal da aplicação"""
with open("app/static/index.html", "r") as f:
return HTMLResponse(content=f.read())
@app.get("/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"service": "openshift-resource-governance",
"version": "1.0.0"
}
if __name__ == "__main__":
import uvicorn
uvicorn.run(
"app.main:app",
host="0.0.0.0",
port=8080,
reload=True
)

app/models/__init__.py Normal file

@@ -0,0 +1 @@
# Models

app/models/resource_models.py Normal file

@@ -0,0 +1,82 @@
"""
Data models for Kubernetes resources
"""
from typing import List, Dict, Any, Optional
from pydantic import BaseModel
class ContainerResource(BaseModel):
"""Recursos de um container"""
name: str
image: str
resources: Dict[str, Dict[str, str]]
class PodResource(BaseModel):
"""Recursos de um pod"""
name: str
namespace: str
node_name: Optional[str] = None
phase: str
containers: List[ContainerResource]
class NamespaceResources(BaseModel):
"""Recursos de um namespace"""
name: str
pods: List[PodResource]
total_cpu_requests: str = "0"
total_cpu_limits: str = "0"
total_memory_requests: str = "0"
total_memory_limits: str = "0"
class VPARecommendation(BaseModel):
"""Recomendação do VPA"""
name: str
namespace: str
target_ref: Dict[str, str]
recommendations: Dict[str, Any]
class ResourceValidation(BaseModel):
"""Resultado de validação de recursos"""
pod_name: str
namespace: str
container_name: str
validation_type: str # "missing_requests", "missing_limits", "invalid_ratio", "overcommit"
severity: str # "warning", "error", "critical"
message: str
recommendation: Optional[str] = None
class ClusterReport(BaseModel):
"""Relatório do cluster"""
timestamp: str
total_pods: int
total_namespaces: int
total_nodes: int
validations: List[ResourceValidation]
vpa_recommendations: List[VPARecommendation]
overcommit_info: Dict[str, Any]
summary: Dict[str, Any]
class NamespaceReport(BaseModel):
"""Relatório de um namespace"""
namespace: str
timestamp: str
total_pods: int
validations: List[ResourceValidation]
resource_usage: Dict[str, Any]
recommendations: List[str]
class ExportRequest(BaseModel):
"""Request para exportar relatório"""
format: str # "json", "csv", "pdf"
namespaces: Optional[List[str]] = None
include_vpa: bool = True
include_validations: bool = True
class ApplyRecommendationRequest(BaseModel):
"""Request para aplicar recomendação"""
pod_name: str
namespace: str
container_name: str
resource_type: str # "cpu", "memory"
action: str # "requests", "limits"
value: str
dry_run: bool = True
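A short sketch of how these request models are used (the JSON body accepted by `POST /api/v1/export` maps onto `ExportRequest`, with pydantic filling in the defaults):

```python
# Minimal sketch: building the request models used by the API.
from app.models.resource_models import ApplyRecommendationRequest, ExportRequest

req = ExportRequest(format="csv", namespaces=["default"])
print(req.dict())
# {'format': 'csv', 'namespaces': ['default'], 'include_vpa': True, 'include_validations': True}

apply_req = ApplyRecommendationRequest(
    pod_name="demo", namespace="default", container_name="nginx",
    resource_type="cpu", action="requests", value="250m",
)
print(apply_req.dry_run)  # True by default (requests are simulated unless set to False)
```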

app/services/__init__.py Normal file

@@ -0,0 +1 @@
# Services

app/services/report_service.py Normal file

@@ -0,0 +1,306 @@
"""
Report generation service
"""
import logging
import json
import csv
import os
from datetime import datetime
from typing import List, Dict, Any, Optional
from io import StringIO
from app.models.resource_models import (
ClusterReport, NamespaceReport, ResourceValidation,
VPARecommendation, ExportRequest
)
from app.core.config import settings
logger = logging.getLogger(__name__)
class ReportService:
"""Serviço para geração de relatórios"""
def __init__(self):
self.export_path = settings.report_export_path
os.makedirs(self.export_path, exist_ok=True)
def generate_cluster_report(
self,
pods: List[Any],
validations: List[ResourceValidation],
vpa_recommendations: List[VPARecommendation],
overcommit_info: Dict[str, Any],
nodes_info: List[Dict[str, Any]]
) -> ClusterReport:
"""Gerar relatório do cluster"""
# Contar namespaces únicos
namespaces = set(pod.namespace for pod in pods)
# Gerar resumo
summary = self._generate_summary(validations, vpa_recommendations, overcommit_info)
report = ClusterReport(
timestamp=datetime.now().isoformat(),
total_pods=len(pods),
total_namespaces=len(namespaces),
total_nodes=len(nodes_info),
validations=validations,
vpa_recommendations=vpa_recommendations,
overcommit_info=overcommit_info,
summary=summary
)
return report
def generate_namespace_report(
self,
namespace: str,
pods: List[Any],
validations: List[ResourceValidation],
resource_usage: Dict[str, Any]
) -> NamespaceReport:
"""Gerar relatório de um namespace"""
# Filtrar validações do namespace
namespace_validations = [
v for v in validations if v.namespace == namespace
]
# Gerar recomendações
recommendations = self._generate_namespace_recommendations(namespace_validations)
report = NamespaceReport(
namespace=namespace,
timestamp=datetime.now().isoformat(),
total_pods=len(pods),
validations=namespace_validations,
resource_usage=resource_usage,
recommendations=recommendations
)
return report
def _generate_summary(
self,
validations: List[ResourceValidation],
vpa_recommendations: List[VPARecommendation],
overcommit_info: Dict[str, Any]
) -> Dict[str, Any]:
"""Gerar resumo do relatório"""
# Contar validações por severidade
severity_counts = {}
for validation in validations:
severity = validation.severity
if severity not in severity_counts:
severity_counts[severity] = 0
severity_counts[severity] += 1
# Contar validações por tipo
type_counts = {}
for validation in validations:
validation_type = validation.validation_type
if validation_type not in type_counts:
type_counts[validation_type] = 0
type_counts[validation_type] += 1
return {
"total_validations": len(validations),
"severity_breakdown": severity_counts,
"validation_types": type_counts,
"vpa_recommendations_count": len(vpa_recommendations),
"overcommit_detected": overcommit_info.get("overcommit_detected", False),
"critical_issues": severity_counts.get("critical", 0),
"warnings": severity_counts.get("warning", 0),
"errors": severity_counts.get("error", 0)
}
def _generate_namespace_recommendations(
self,
validations: List[ResourceValidation]
) -> List[str]:
"""Gerar recomendações para um namespace"""
recommendations = []
# Agrupar por tipo de problema
problems = {}
for validation in validations:
problem_type = validation.validation_type
if problem_type not in problems:
problems[problem_type] = []
problems[problem_type].append(validation)
# Gerar recomendações específicas
if "missing_requests" in problems:
count = len(problems["missing_requests"])
recommendations.append(
f"Criar LimitRange para definir requests padrão "
f"({count} containers sem requests)"
)
if "missing_limits" in problems:
count = len(problems["missing_limits"])
recommendations.append(
f"Definir limits para {count} containers para evitar consumo excessivo"
)
if "invalid_ratio" in problems:
count = len(problems["invalid_ratio"])
recommendations.append(
f"Ajustar ratio limit:request para {count} containers"
)
if "overcommit" in problems:
recommendations.append(
"Resolver overcommit de recursos no namespace"
)
return recommendations
async def export_report(
self,
report: ClusterReport,
export_request: ExportRequest
) -> str:
"""Exportar relatório em diferentes formatos"""
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
if export_request.format == "json":
return await self._export_json(report, timestamp)
elif export_request.format == "csv":
return await self._export_csv(report, timestamp)
elif export_request.format == "pdf":
return await self._export_pdf(report, timestamp)
else:
raise ValueError(f"Formato não suportado: {export_request.format}")
async def _export_json(self, report: ClusterReport, timestamp: str) -> str:
"""Exportar relatório em JSON"""
filename = f"cluster_report_{timestamp}.json"
filepath = os.path.join(self.export_path, filename)
# Converter para dict para serialização
report_dict = report.dict()
with open(filepath, 'w', encoding='utf-8') as f:
json.dump(report_dict, f, indent=2, ensure_ascii=False)
logger.info(f"Relatório JSON exportado: {filepath}")
return filepath
async def _export_csv(self, report: ClusterReport, timestamp: str) -> str:
"""Exportar relatório em CSV"""
filename = f"cluster_report_{timestamp}.csv"
filepath = os.path.join(self.export_path, filename)
with open(filepath, 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
# Cabeçalho
writer.writerow([
"Pod Name", "Namespace", "Container Name",
"Validation Type", "Severity", "Message", "Recommendation"
])
# Dados das validações
for validation in report.validations:
writer.writerow([
validation.pod_name,
validation.namespace,
validation.container_name,
validation.validation_type,
validation.severity,
validation.message,
validation.recommendation or ""
])
logger.info(f"Relatório CSV exportado: {filepath}")
return filepath
async def _export_pdf(self, report: ClusterReport, timestamp: str) -> str:
"""Exportar relatório em PDF"""
try:
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors
filename = f"cluster_report_{timestamp}.pdf"
filepath = os.path.join(self.export_path, filename)
doc = SimpleDocTemplate(filepath, pagesize=letter)
styles = getSampleStyleSheet()
story = []
# Título
title = Paragraph("OpenShift Resource Governance Report", styles['Title'])
story.append(title)
story.append(Spacer(1, 12))
# Resumo
summary_text = f"""
<b>Resumo do Cluster:</b><br/>
Total de Pods: {report.total_pods}<br/>
Total de Namespaces: {report.total_namespaces}<br/>
Total de Nós: {report.total_nodes}<br/>
Total de Validações: {report.summary['total_validations']}<br/>
Problemas Críticos: {report.summary['critical_issues']}<br/>
"""
story.append(Paragraph(summary_text, styles['Normal']))
story.append(Spacer(1, 12))
# Tabela de validações
if report.validations:
data = [["Pod", "Namespace", "Container", "Tipo", "Severidade", "Mensagem"]]
for validation in report.validations[:50]: # Limitar a 50 para PDF
data.append([
validation.pod_name,
validation.namespace,
validation.container_name,
validation.validation_type,
validation.severity,
validation.message[:50] + "..." if len(validation.message) > 50 else validation.message
])
table = Table(data)
table.setStyle(TableStyle([
('BACKGROUND', (0, 0), (-1, 0), colors.grey),
('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
('ALIGN', (0, 0), (-1, -1), 'CENTER'),
('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
('FONTSIZE', (0, 0), (-1, 0), 14),
('BOTTOMPADDING', (0, 0), (-1, 0), 12),
('BACKGROUND', (0, 1), (-1, -1), colors.beige),
('GRID', (0, 0), (-1, -1), 1, colors.black)
]))
story.append(Paragraph("<b>Validações:</b>", styles['Heading2']))
story.append(table)
doc.build(story)
logger.info(f"Relatório PDF exportado: {filepath}")
return filepath
except ImportError:
logger.error("reportlab não instalado. Instale com: pip install reportlab")
raise ValueError("PDF export requer reportlab")
def get_exported_reports(self) -> List[Dict[str, str]]:
"""Listar relatórios exportados"""
reports = []
for filename in os.listdir(self.export_path):
if filename.endswith(('.json', '.csv', '.pdf')):
filepath = os.path.join(self.export_path, filename)
stat = os.stat(filepath)
reports.append({
"filename": filename,
"filepath": filepath,
"size": stat.st_size,
"created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
"format": filename.split('.')[-1]
})
return sorted(reports, key=lambda x: x["created"], reverse=True)

app/services/validation_service.py Normal file

@@ -0,0 +1,345 @@
"""
Resource validation service following Red Hat best practices
"""
import logging
from typing import List, Dict, Any
from decimal import Decimal, InvalidOperation
import re
from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources
from app.core.config import settings
logger = logging.getLogger(__name__)
class ValidationService:
"""Serviço para validação de recursos"""
def __init__(self):
self.cpu_ratio = settings.cpu_limit_ratio
self.memory_ratio = settings.memory_limit_ratio
self.min_cpu_request = settings.min_cpu_request
self.min_memory_request = settings.min_memory_request
def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]:
"""Validar recursos de um pod"""
validations = []
for container in pod.containers:
container_validations = self._validate_container_resources(
pod.name, pod.namespace, container
)
validations.extend(container_validations)
return validations
def _validate_container_resources(
self,
pod_name: str,
namespace: str,
container: Dict[str, Any]
) -> List[ResourceValidation]:
"""Validar recursos de um container"""
validations = []
resources = container.get("resources", {})
requests = resources.get("requests", {})
limits = resources.get("limits", {})
# 1. Verificar se requests estão definidos
if not requests:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container["name"],
validation_type="missing_requests",
severity="error",
message="Container sem requests definidos",
recommendation="Definir requests de CPU e memória para garantir QoS"
))
# 2. Verificar se limits estão definidos
if not limits:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container["name"],
validation_type="missing_limits",
severity="warning",
message="Container sem limits definidos",
recommendation="Definir limits para evitar consumo excessivo de recursos"
))
# 3. Validar ratio limit:request
if requests and limits:
cpu_validation = self._validate_cpu_ratio(
pod_name, namespace, container["name"], requests, limits
)
if cpu_validation:
validations.append(cpu_validation)
memory_validation = self._validate_memory_ratio(
pod_name, namespace, container["name"], requests, limits
)
if memory_validation:
validations.append(memory_validation)
# 4. Validar valores mínimos
if requests:
min_validation = self._validate_minimum_values(
pod_name, namespace, container["name"], requests
)
validations.extend(min_validation)
return validations
def _validate_cpu_ratio(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str],
limits: Dict[str, str]
) -> ResourceValidation:
"""Validar ratio CPU limit:request"""
if "cpu" not in requests or "cpu" not in limits:
return None
try:
request_value = self._parse_cpu_value(requests["cpu"])
limit_value = self._parse_cpu_value(limits["cpu"])
if request_value > 0:
ratio = limit_value / request_value
if ratio > self.cpu_ratio * 1.5: # 50% de tolerância
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="warning",
message=f"Ratio CPU limit:request muito alto ({ratio:.2f}:1)",
recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.cpu_ratio}:1)"
)
elif ratio < 1.0:
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="error",
message=f"CPU limit menor que request ({ratio:.2f}:1)",
recommendation="CPU limit deve ser maior ou igual ao request"
)
except (ValueError, InvalidOperation) as e:
logger.warning(f"Erro ao validar ratio CPU: {e}")
return None
def _validate_memory_ratio(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str],
limits: Dict[str, str]
) -> ResourceValidation:
"""Validar ratio memória limit:request"""
if "memory" not in requests or "memory" not in limits:
return None
try:
request_value = self._parse_memory_value(requests["memory"])
limit_value = self._parse_memory_value(limits["memory"])
if request_value > 0:
ratio = limit_value / request_value
if ratio > self.memory_ratio * 1.5: # 50% de tolerância
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="warning",
message=f"Ratio memória limit:request muito alto ({ratio:.2f}:1)",
recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.memory_ratio}:1)"
)
elif ratio < 1.0:
return ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="invalid_ratio",
severity="error",
message=f"Memória limit menor que request ({ratio:.2f}:1)",
recommendation="Memória limit deve ser maior ou igual ao request"
)
except (ValueError, InvalidOperation) as e:
logger.warning(f"Erro ao validar ratio memória: {e}")
return None
def _validate_minimum_values(
self,
pod_name: str,
namespace: str,
container_name: str,
requests: Dict[str, str]
) -> List[ResourceValidation]:
"""Validar valores mínimos de requests"""
validations = []
# Validar CPU mínima
if "cpu" in requests:
try:
request_value = self._parse_cpu_value(requests["cpu"])
min_value = self._parse_cpu_value(self.min_cpu_request)
if request_value < min_value:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="minimum_value",
severity="warning",
message=f"CPU request muito baixo ({requests['cpu']})",
recommendation=f"Considerar aumentar para pelo menos {self.min_cpu_request}"
))
except (ValueError, InvalidOperation):
pass
# Validar memória mínima
if "memory" in requests:
try:
request_value = self._parse_memory_value(requests["memory"])
min_value = self._parse_memory_value(self.min_memory_request)
if request_value < min_value:
validations.append(ResourceValidation(
pod_name=pod_name,
namespace=namespace,
container_name=container_name,
validation_type="minimum_value",
severity="warning",
message=f"Memória request muito baixa ({requests['memory']})",
recommendation=f"Considerar aumentar para pelo menos {self.min_memory_request}"
))
except (ValueError, InvalidOperation):
pass
return validations
def _parse_cpu_value(self, value: str) -> float:
"""Converter valor de CPU para float (cores)"""
if value.endswith('m'):
return float(value[:-1]) / 1000
elif value.endswith('n'):
return float(value[:-1]) / 1000000000
else:
return float(value)
def _parse_memory_value(self, value: str) -> int:
"""Converter valor de memória para bytes"""
value = value.upper()
if value.endswith('KI'):
return int(float(value[:-2]) * 1024)
elif value.endswith('MI'):
return int(float(value[:-2]) * 1024 * 1024)
elif value.endswith('GI'):
return int(float(value[:-2]) * 1024 * 1024 * 1024)
elif value.endswith('K'):
return int(float(value[:-1]) * 1000)
elif value.endswith('M'):
return int(float(value[:-1]) * 1000 * 1000)
elif value.endswith('G'):
return int(float(value[:-1]) * 1000 * 1000 * 1000)
else:
return int(value)
def validate_namespace_overcommit(
self,
namespace_resources: NamespaceResources,
node_capacity: Dict[str, str]
) -> List[ResourceValidation]:
"""Validar overcommit em um namespace"""
validations = []
# Calcular total de requests do namespace
total_cpu_requests = self._parse_cpu_value(namespace_resources.total_cpu_requests)
total_memory_requests = self._parse_memory_value(namespace_resources.total_memory_requests)
# Calcular capacidade total dos nós
total_cpu_capacity = self._parse_cpu_value(node_capacity.get("cpu", "0"))
total_memory_capacity = self._parse_memory_value(node_capacity.get("memory", "0"))
# Verificar overcommit de CPU
if total_cpu_capacity > 0:
cpu_utilization = (total_cpu_requests / total_cpu_capacity) * 100
if cpu_utilization > 100:
validations.append(ResourceValidation(
pod_name="namespace",
namespace=namespace_resources.name,
container_name="all",
validation_type="overcommit",
severity="critical",
message=f"Overcommit de CPU no namespace: {cpu_utilization:.1f}%",
recommendation="Reduzir requests de CPU ou adicionar mais nós ao cluster"
))
# Verificar overcommit de memória
if total_memory_capacity > 0:
memory_utilization = (total_memory_requests / total_memory_capacity) * 100
if memory_utilization > 100:
validations.append(ResourceValidation(
pod_name="namespace",
namespace=namespace_resources.name,
container_name="all",
validation_type="overcommit",
severity="critical",
message=f"Overcommit de memória no namespace: {memory_utilization:.1f}%",
recommendation="Reduzir requests de memória ou adicionar mais nós ao cluster"
))
return validations
def generate_recommendations(self, validations: List[ResourceValidation]) -> List[str]:
"""Gerar recomendações baseadas nas validações"""
recommendations = []
# Agrupar validações por tipo
validation_counts = {}
for validation in validations:
validation_type = validation.validation_type
if validation_type not in validation_counts:
validation_counts[validation_type] = 0
validation_counts[validation_type] += 1
# Gerar recomendações baseadas nos problemas encontrados
if validation_counts.get("missing_requests", 0) > 0:
recommendations.append(
f"Implementar LimitRange no namespace para definir requests padrão "
f"({validation_counts['missing_requests']} containers sem requests)"
)
if validation_counts.get("missing_limits", 0) > 0:
recommendations.append(
f"Definir limits para {validation_counts['missing_limits']} containers "
"para evitar consumo excessivo de recursos"
)
if validation_counts.get("invalid_ratio", 0) > 0:
recommendations.append(
f"Ajustar ratio limit:request para {validation_counts['invalid_ratio']} containers "
f"(recomendado: {self.cpu_ratio}:1)"
)
if validation_counts.get("overcommit", 0) > 0:
recommendations.append(
f"Resolver overcommit em {validation_counts['overcommit']} namespaces "
"para evitar problemas de performance"
)
return recommendations
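The two parsing helpers above follow the Kubernetes quantity conventions: CPU values are cores with optional m (milli) or n (nano) suffixes, and memory values use binary suffixes (Ki/Mi/Gi) or decimal ones (K/M/G). A standalone sketch of the same conversion rules, handy for spot-checking values by hand; this is illustrative code, not part of the application:

# Sketch of the conversion rules implemented by _parse_cpu_value/_parse_memory_value.
def cpu_to_cores(value: str) -> float:
    if value.endswith("m"):
        return float(value[:-1]) / 1_000
    if value.endswith("n"):
        return float(value[:-1]) / 1_000_000_000
    return float(value)

def memory_to_bytes(value: str) -> int:
    factors = {"Ki": 1024, "Mi": 1024**2, "Gi": 1024**3, "K": 1000, "M": 1000**2, "G": 1000**3}
    for suffix, factor in factors.items():
        if value.endswith(suffix):
            return int(float(value[:-len(suffix)]) * factor)
    return int(value)

assert cpu_to_cores("250m") == 0.25
assert memory_to_bytes("512Mi") == 536870912
assert memory_to_bytes("1G") == 1000000000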

530
app/static/index.html Normal file
View File

@@ -0,0 +1,530 @@
<!DOCTYPE html>
<html lang="pt-BR">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenShift Resource Governance Tool</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background-color: #f5f5f5;
color: #333;
}
.header {
background: linear-gradient(135deg, #cc0000, #8b0000);
color: white;
padding: 1rem 2rem;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.header h1 {
font-size: 1.8rem;
font-weight: 600;
}
.header p {
margin-top: 0.5rem;
opacity: 0.9;
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
}
.card {
background: white;
border-radius: 8px;
padding: 1.5rem;
margin-bottom: 1.5rem;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.card h2 {
color: #cc0000;
margin-bottom: 1rem;
font-size: 1.3rem;
}
.stats-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-bottom: 2rem;
}
.stat-card {
background: white;
padding: 1.5rem;
border-radius: 8px;
text-align: center;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
.stat-number {
font-size: 2rem;
font-weight: bold;
color: #cc0000;
}
.stat-label {
color: #666;
margin-top: 0.5rem;
}
.btn {
background: #cc0000;
color: white;
border: none;
padding: 0.75rem 1.5rem;
border-radius: 4px;
cursor: pointer;
font-size: 1rem;
transition: background-color 0.2s;
}
.btn:hover {
background: #8b0000;
}
.btn:disabled {
background: #ccc;
cursor: not-allowed;
}
.btn-secondary {
background: #6c757d;
}
.btn-secondary:hover {
background: #545b62;
}
.loading {
text-align: center;
padding: 2rem;
color: #666;
}
.error {
background: #f8d7da;
color: #721c24;
padding: 1rem;
border-radius: 4px;
margin: 1rem 0;
}
.success {
background: #d4edda;
color: #155724;
padding: 1rem;
border-radius: 4px;
margin: 1rem 0;
}
.validation-item {
padding: 1rem;
border-left: 4px solid #ccc;
margin: 0.5rem 0;
background: #f8f9fa;
}
.validation-item.error {
border-left-color: #dc3545;
background: #f8d7da;
}
.validation-item.warning {
border-left-color: #ffc107;
background: #fff3cd;
}
.validation-item.critical {
border-left-color: #dc3545;
background: #f8d7da;
font-weight: bold;
}
.validation-header {
font-weight: bold;
margin-bottom: 0.5rem;
}
.validation-message {
color: #666;
margin-bottom: 0.5rem;
}
.validation-recommendation {
font-style: italic;
color: #007bff;
}
.export-section {
display: flex;
gap: 1rem;
align-items: center;
flex-wrap: wrap;
}
.export-section select,
.export-section input {
padding: 0.5rem;
border: 1px solid #ddd;
border-radius: 4px;
}
.table {
width: 100%;
border-collapse: collapse;
margin-top: 1rem;
}
.table th,
.table td {
padding: 0.75rem;
text-align: left;
border-bottom: 1px solid #ddd;
}
.table th {
background: #f8f9fa;
font-weight: 600;
}
.severity-badge {
padding: 0.25rem 0.5rem;
border-radius: 12px;
font-size: 0.8rem;
font-weight: bold;
}
.severity-error {
background: #f8d7da;
color: #721c24;
}
.severity-warning {
background: #fff3cd;
color: #856404;
}
.severity-critical {
background: #f8d7da;
color: #721c24;
}
.hidden {
display: none;
}
@media (max-width: 768px) {
.container {
padding: 1rem;
}
.stats-grid {
grid-template-columns: 1fr;
}
.export-section {
flex-direction: column;
align-items: stretch;
}
}
</style>
</head>
<body>
<div class="header">
<h1>OpenShift Resource Governance Tool</h1>
<p>Ferramenta de governança de recursos para clusters OpenShift</p>
</div>
<div class="container">
<!-- Estatísticas do Cluster -->
<div class="stats-grid" id="statsGrid">
<div class="stat-card">
<div class="stat-number" id="totalPods">-</div>
<div class="stat-label">Total de Pods</div>
</div>
<div class="stat-card">
<div class="stat-number" id="totalNamespaces">-</div>
<div class="stat-label">Namespaces</div>
</div>
<div class="stat-card">
<div class="stat-number" id="totalNodes">-</div>
<div class="stat-label">Nós</div>
</div>
<div class="stat-card">
<div class="stat-number" id="criticalIssues">-</div>
<div class="stat-label">Problemas Críticos</div>
</div>
</div>
<!-- Controles -->
<div class="card">
<h2>Controles</h2>
<div style="display: flex; gap: 1rem; flex-wrap: wrap;">
<button class="btn" onclick="loadClusterStatus()">Atualizar Status</button>
<button class="btn btn-secondary" onclick="loadValidations()">Ver Validações</button>
<button class="btn btn-secondary" onclick="loadVPARecommendations()">Ver VPA</button>
</div>
</div>
<!-- Exportar Relatórios -->
<div class="card">
<h2>Exportar Relatórios</h2>
<div class="export-section">
<select id="exportFormat">
<option value="json">JSON</option>
<option value="csv">CSV</option>
<option value="pdf">PDF</option>
</select>
<input type="text" id="namespaces" placeholder="Namespaces (opcional, separados por vírgula)">
<label>
<input type="checkbox" id="includeVPA" checked> Incluir VPA
</label>
<label>
<input type="checkbox" id="includeValidations" checked> Incluir Validações
</label>
<button class="btn" onclick="exportReport()">Exportar</button>
</div>
</div>
<!-- Validações -->
<div class="card" id="validationsCard" style="display: none;">
<h2>Validações de Recursos</h2>
<div id="validationsList"></div>
</div>
<!-- Recomendações VPA -->
<div class="card" id="vpaCard" style="display: none;">
<h2>Recomendações VPA</h2>
<div id="vpaList"></div>
</div>
<!-- Loading -->
<div class="loading hidden" id="loading">
<p>Carregando dados...</p>
</div>
<!-- Error -->
<div class="error hidden" id="error"></div>
<!-- Success -->
<div class="success hidden" id="success"></div>
</div>
<script>
let currentData = null;
// Carregar status inicial
document.addEventListener('DOMContentLoaded', function() {
loadClusterStatus();
});
async function loadClusterStatus() {
showLoading();
hideMessages();
try {
const response = await fetch('/api/v1/cluster/status');
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const data = await response.json();
currentData = data;
updateStats(data);
showSuccess('Status do cluster carregado com sucesso');
} catch (error) {
showError('Erro ao carregar status do cluster: ' + error.message);
} finally {
hideLoading();
}
}
async function loadValidations() {
if (!currentData) {
showError('Carregue o status do cluster primeiro');
return;
}
showLoading();
try {
const response = await fetch('/api/v1/validations');
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const validations = await response.json();
displayValidations(validations);
document.getElementById('validationsCard').style.display = 'block';
} catch (error) {
showError('Erro ao carregar validações: ' + error.message);
} finally {
hideLoading();
}
}
async function loadVPARecommendations() {
showLoading();
try {
const response = await fetch('/api/v1/vpa/recommendations');
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const recommendations = await response.json();
displayVPARecommendations(recommendations);
document.getElementById('vpaCard').style.display = 'block';
} catch (error) {
showError('Erro ao carregar recomendações VPA: ' + error.message);
} finally {
hideLoading();
}
}
async function exportReport() {
showLoading();
try {
const format = document.getElementById('exportFormat').value;
const namespaces = document.getElementById('namespaces').value;
const includeVPA = document.getElementById('includeVPA').checked;
const includeValidations = document.getElementById('includeValidations').checked;
const requestBody = {
format: format,
includeVPA: includeVPA,
includeValidations: includeValidations
};
if (namespaces.trim()) {
requestBody.namespaces = namespaces.split(',').map(n => n.trim());
}
const response = await fetch('/api/v1/export', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(requestBody)
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const result = await response.json();
showSuccess(`Relatório exportado: ${result.filepath}`);
} catch (error) {
showError('Erro ao exportar relatório: ' + error.message);
} finally {
hideLoading();
}
}
function updateStats(data) {
document.getElementById('totalPods').textContent = data.total_pods || 0;
document.getElementById('totalNamespaces').textContent = data.total_namespaces || 0;
document.getElementById('totalNodes').textContent = data.total_nodes || 0;
document.getElementById('criticalIssues').textContent = data.summary?.critical_issues || 0;
}
function displayValidations(validations) {
const container = document.getElementById('validationsList');
if (validations.length === 0) {
container.innerHTML = '<p>Nenhuma validação encontrada.</p>';
return;
}
let html = '<table class="table"><thead><tr><th>Pod</th><th>Namespace</th><th>Container</th><th>Tipo</th><th>Severidade</th><th>Mensagem</th></tr></thead><tbody>';
validations.forEach(validation => {
const severityClass = `severity-${validation.severity}`;
html += `
<tr>
<td>${validation.pod_name}</td>
<td>${validation.namespace}</td>
<td>${validation.container_name}</td>
<td>${validation.validation_type}</td>
<td><span class="severity-badge ${severityClass}">${validation.severity}</span></td>
<td>${validation.message}</td>
</tr>
`;
});
html += '</tbody></table>';
container.innerHTML = html;
}
function displayVPARecommendations(recommendations) {
const container = document.getElementById('vpaList');
if (recommendations.length === 0) {
container.innerHTML = '<p>Nenhuma recomendação VPA encontrada.</p>';
return;
}
let html = '<table class="table"><thead><tr><th>Nome</th><th>Namespace</th><th>Target</th><th>Recomendações</th></tr></thead><tbody>';
recommendations.forEach(rec => {
html += `
<tr>
<td>${rec.name}</td>
<td>${rec.namespace}</td>
                        <td>${rec.target_ref ? rec.target_ref.kind + '/' + rec.target_ref.name : 'N/A'}</td>
<td>${JSON.stringify(rec.recommendations, null, 2)}</td>
</tr>
`;
});
html += '</tbody></table>';
container.innerHTML = html;
}
function showLoading() {
document.getElementById('loading').classList.remove('hidden');
}
function hideLoading() {
document.getElementById('loading').classList.add('hidden');
}
function showError(message) {
const errorDiv = document.getElementById('error');
errorDiv.textContent = message;
errorDiv.classList.remove('hidden');
setTimeout(() => errorDiv.classList.add('hidden'), 5000);
}
function showSuccess(message) {
const successDiv = document.getElementById('success');
successDiv.textContent = message;
successDiv.classList.remove('hidden');
setTimeout(() => successDiv.classList.add('hidden'), 3000);
}
function hideMessages() {
document.getElementById('error').classList.add('hidden');
document.getElementById('success').classList.add('hidden');
}
</script>
</body>
</html>
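The page drives four backend endpoints (/api/v1/cluster/status, /api/v1/validations, /api/v1/vpa/recommendations and /api/v1/export). The export call can also be issued from a script. The sketch below mirrors the request body built in exportReport() above; it assumes the API is reachable on localhost:8080 (for example via a port-forward), and the backend's exact schema is not shown in this commit:

# Sketch: trigger a report export without the UI (field names taken from exportReport()).
import requests

resp = requests.post(
    "http://localhost:8080/api/v1/export",  # assumed local or port-forwarded address
    json={
        "format": "csv",
        "includeVPA": True,
        "includeValidations": True,
        "namespaces": ["openshift-monitoring"],  # optional, as in the UI
    },
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["filepath"])  # the UI reads the same "filepath" field from this response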

32
k8s/configmap.yaml Normal file
View File

@@ -0,0 +1,32 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: resource-governance-config
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
data:
# Configurações da aplicação
CPU_LIMIT_RATIO: "3.0"
MEMORY_LIMIT_RATIO: "3.0"
MIN_CPU_REQUEST: "10m"
MIN_MEMORY_REQUEST: "32Mi"
# Namespaces críticos para VPA
  CRITICAL_NAMESPACES: "openshift-monitoring,openshift-ingress,openshift-apiserver,openshift-controller-manager,openshift-sdn"
# URL do Prometheus
PROMETHEUS_URL: "http://prometheus.openshift-monitoring.svc.cluster.local:9090"
# Configurações de relatório
REPORT_EXPORT_PATH: "/tmp/reports"
# Configurações de segurança
ENABLE_RBAC: "true"
SERVICE_ACCOUNT_NAME: "resource-governance-sa"
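CRITICAL_NAMESPACES is kept comma-separated, following the convention documented in .env.example. A sketch of how such a value is typically consumed; the application's actual settings loader is not shown in this excerpt:

# Sketch: splitting a comma-separated CRITICAL_NAMESPACES environment variable.
import os

critical_namespaces = [
    ns.strip()
    for ns in os.environ.get("CRITICAL_NAMESPACES", "").split(",")
    if ns.strip()
]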

122
k8s/daemonset.yaml Normal file
View File

@@ -0,0 +1,122 @@
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: resource-governance
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
spec:
selector:
matchLabels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
template:
metadata:
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
spec:
serviceAccountName: resource-governance-sa
securityContext:
runAsNonRoot: true
runAsUser: 1000
fsGroup: 1000
containers:
- name: resource-governance
image: resource-governance:latest
imagePullPolicy: Always
ports:
- containerPort: 8080
name: http
protocol: TCP
env:
- name: CPU_LIMIT_RATIO
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: CPU_LIMIT_RATIO
- name: MEMORY_LIMIT_RATIO
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: MEMORY_LIMIT_RATIO
- name: MIN_CPU_REQUEST
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: MIN_CPU_REQUEST
- name: MIN_MEMORY_REQUEST
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: MIN_MEMORY_REQUEST
- name: CRITICAL_NAMESPACES
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: CRITICAL_NAMESPACES
- name: PROMETHEUS_URL
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: PROMETHEUS_URL
- name: REPORT_EXPORT_PATH
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: REPORT_EXPORT_PATH
- name: ENABLE_RBAC
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: ENABLE_RBAC
- name: SERVICE_ACCOUNT_NAME
valueFrom:
configMapKeyRef:
name: resource-governance-config
key: SERVICE_ACCOUNT_NAME
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
livenessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 30
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /health
port: 8080
initialDelaySeconds: 5
periodSeconds: 5
timeoutSeconds: 3
failureThreshold: 3
volumeMounts:
- name: reports-volume
mountPath: /tmp/reports
- name: tmp-volume
mountPath: /tmp
volumes:
- name: reports-volume
emptyDir: {}
- name: tmp-volume
emptyDir: {}
nodeSelector:
kubernetes.io/os: linux
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
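Both probes call GET /health on port 8080. The same endpoint can be checked from a workstation after forwarding the service port (for example, oc port-forward svc/resource-governance-service 8080:8080 -n resource-governance); a minimal sketch:

# Sketch: hit the same /health endpoint the liveness/readiness probes use.
import requests

r = requests.get("http://localhost:8080/health", timeout=5)  # assumes the port-forward above
print(r.status_code, r.text)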

19
k8s/kustomization.yaml Normal file
View File

@@ -0,0 +1,19 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- rbac.yaml
- configmap.yaml
- daemonset.yaml
- service.yaml
- route.yaml
commonLabels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
app.kubernetes.io/part-of: openshift-governance
images:
- name: resource-governance
newTag: latest

36
k8s/namespace.yaml Normal file
View File

@@ -0,0 +1,36 @@
apiVersion: v1
kind: Namespace
metadata:
name: resource-governance
labels:
name: resource-governance
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
---
apiVersion: v1
kind: ResourceQuota
metadata:
name: resource-governance-quota
namespace: resource-governance
spec:
hard:
requests.cpu: "2"
requests.memory: 4Gi
limits.cpu: "4"
limits.memory: 8Gi
pods: "10"
---
apiVersion: v1
kind: LimitRange
metadata:
name: resource-governance-limits
namespace: resource-governance
spec:
limits:
- default:
cpu: "500m"
memory: "512Mi"
defaultRequest:
cpu: "100m"
memory: "128Mi"
type: Container

93
k8s/rbac.yaml Normal file
View File

@@ -0,0 +1,93 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: resource-governance-sa
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: resource-governance-role
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
rules:
# Permissões para listar e ler pods
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
# Permissões para listar e ler namespaces
- apiGroups: [""]
resources: ["namespaces"]
verbs: ["get", "list", "watch"]
# Permissões para listar e ler nós
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
# Permissões para VPA (Vertical Pod Autoscaler)
- apiGroups: ["autoscaling.k8s.io"]
resources: ["verticalpodautoscalers"]
verbs: ["get", "list", "watch"]
# Permissões para deployments e replicasets (para aplicar recomendações)
- apiGroups: ["apps"]
resources: ["deployments", "replicasets"]
verbs: ["get", "list", "watch", "patch", "update"]
# Permissões para pods (para aplicar recomendações)
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch", "patch", "update"]
# Permissões para eventos (para logging)
- apiGroups: [""]
resources: ["events"]
verbs: ["get", "list", "watch", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: resource-governance-binding
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: resource-governance-role
subjects:
- kind: ServiceAccount
name: resource-governance-sa
namespace: resource-governance
---
# Role para acessar recursos do Prometheus (se necessário)
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: resource-governance-prometheus-role
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
rules:
# Permissões para acessar serviços do Prometheus
- apiGroups: [""]
resources: ["services", "endpoints"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: resource-governance-prometheus-binding
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: resource-governance-prometheus-role
subjects:
- kind: ServiceAccount
name: resource-governance-sa
namespace: resource-governance
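A quick way to confirm the ClusterRole grants what the collector needs is a SelfSubjectAccessReview issued from a pod running under resource-governance-sa. A sketch using the kubernetes client already listed in requirements.txt; the verb/resource pair here is just one example of the rules above:

# Sketch: ask the API server whether the bound service account may list pods cluster-wide.
from kubernetes import client, config

config.load_incluster_config()
review = client.V1SelfSubjectAccessReview(
    spec=client.V1SelfSubjectAccessReviewSpec(
        resource_attributes=client.V1ResourceAttributes(verb="list", resource="pods")
    )
)
result = client.AuthorizationV1Api().create_self_subject_access_review(review)
print("allowed:", result.status.allowed)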

23
k8s/route.yaml Normal file
View File

@@ -0,0 +1,23 @@
apiVersion: route.openshift.io/v1
kind: Route
metadata:
name: resource-governance-route
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
annotations:
haproxy.router.openshift.io/timeout: "300s"
haproxy.router.openshift.io/rate-limit: "100"
spec:
host: resource-governance.apps.openshift.local
to:
kind: Service
name: resource-governance-service
weight: 100
port:
targetPort: http
tls:
termination: edge
insecureEdgeTerminationPolicy: Redirect
wildcardPolicy: None

18
k8s/service.yaml Normal file
View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
name: resource-governance-service
namespace: resource-governance
labels:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: 8080
protocol: TCP
name: http
selector:
app.kubernetes.io/name: resource-governance
app.kubernetes.io/component: governance

14
requirements.txt Normal file
View File

@@ -0,0 +1,14 @@
fastapi==0.104.1
uvicorn==0.24.0
kubernetes==28.1.0
prometheus-client==0.19.0
requests==2.31.0
pydantic==2.5.0
python-multipart==0.0.6
jinja2==3.1.2
aiofiles==23.2.1
pandas==2.1.4
reportlab==4.0.7
python-jose[cryptography]==3.3.0
passlib[bcrypt]==1.7.4
python-dotenv==1.0.0

58
scripts/build.sh Executable file
View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Script de build para OpenShift Resource Governance Tool
set -e
# Cores para output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configurações
IMAGE_NAME="resource-governance"
TAG="${1:-latest}"
REGISTRY="${2:-quay.io/openshift}"
FULL_IMAGE_NAME="${REGISTRY}/${IMAGE_NAME}:${TAG}"
echo -e "${BLUE}🚀 Building OpenShift Resource Governance Tool${NC}"
echo -e "${BLUE}Image: ${FULL_IMAGE_NAME}${NC}"
# Verificar se Docker está rodando
if ! docker info > /dev/null 2>&1; then
echo -e "${RED}❌ Docker não está rodando. Inicie o Docker e tente novamente.${NC}"
exit 1
fi
# Build da imagem
echo -e "${YELLOW}📦 Building Docker image...${NC}"
if docker build -t "${FULL_IMAGE_NAME}" .; then
    echo -e "${GREEN}✅ Image built successfully!${NC}"
else
    echo -e "${RED}❌ Build failed!${NC}"
    exit 1
fi
# Testar a imagem
echo -e "${YELLOW}🧪 Testing image...${NC}"
if docker run --rm "${FULL_IMAGE_NAME}" python -c "import app.main; print('✅ App imports successfully')"; then
    echo -e "${GREEN}✅ Image test passed!${NC}"
else
    echo -e "${RED}❌ Image test failed!${NC}"
    exit 1
fi
# Mostrar informações da imagem
echo -e "${BLUE}📊 Image information:${NC}"
docker images "${FULL_IMAGE_NAME}"
echo -e "${GREEN}🎉 Build completed successfully!${NC}"
echo -e "${BLUE}To push to registry:${NC}"
echo -e " docker push ${FULL_IMAGE_NAME}"
echo -e "${BLUE}To run locally:${NC}"
echo -e " docker run -p 8080:8080 ${FULL_IMAGE_NAME}"

90
scripts/deploy.sh Executable file
View File

@@ -0,0 +1,90 @@
#!/bin/bash
# Script de deploy para OpenShift Resource Governance Tool
set -e
# Cores para output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configurações
NAMESPACE="resource-governance"
IMAGE_NAME="resource-governance"
TAG="${1:-latest}"
REGISTRY="${2:-quay.io/openshift}"
FULL_IMAGE_NAME="${REGISTRY}/${IMAGE_NAME}:${TAG}"
echo -e "${BLUE}🚀 Deploying OpenShift Resource Governance Tool${NC}"
echo -e "${BLUE}Namespace: ${NAMESPACE}${NC}"
echo -e "${BLUE}Image: ${FULL_IMAGE_NAME}${NC}"
# Verificar se oc está instalado
if ! command -v oc &> /dev/null; then
echo -e "${RED}❌ OpenShift CLI (oc) não está instalado.${NC}"
echo -e "${YELLOW}Instale o oc CLI: https://docs.openshift.com/container-platform/latest/cli_reference/openshift_cli/getting-started-cli.html${NC}"
exit 1
fi
# Verificar se está logado no OpenShift
if ! oc whoami &> /dev/null; then
echo -e "${RED}❌ Não está logado no OpenShift.${NC}"
echo -e "${YELLOW}Faça login com: oc login <cluster-url>${NC}"
exit 1
fi
echo -e "${GREEN}✅ Logado como: $(oc whoami)${NC}"
# Criar namespace se não existir
echo -e "${YELLOW}📁 Creating namespace...${NC}"
oc apply -f k8s/namespace.yaml
# Aplicar RBAC
echo -e "${YELLOW}🔐 Applying RBAC...${NC}"
oc apply -f k8s/rbac.yaml
# Aplicar ConfigMap
echo -e "${YELLOW}⚙️ Applying ConfigMap...${NC}"
oc apply -f k8s/configmap.yaml
# Aplicar DaemonSet
echo -e "${YELLOW}📦 Applying DaemonSet...${NC}"
oc apply -f k8s/daemonset.yaml
# Atualizar imagem no DaemonSet (após o apply, para que o recurso já exista)
echo -e "${YELLOW}🔄 Updating image in DaemonSet...${NC}"
oc set image daemonset/resource-governance resource-governance="${FULL_IMAGE_NAME}" -n "${NAMESPACE}"
# Aplicar Service
echo -e "${YELLOW}🌐 Applying Service...${NC}"
oc apply -f k8s/service.yaml
# Aplicar Route
echo -e "${YELLOW}🛣️ Applying Route...${NC}"
oc apply -f k8s/route.yaml
# Aguardar pods ficarem prontos
echo -e "${YELLOW}⏳ Waiting for pods to be ready...${NC}"
oc wait --for=condition=ready pod -l app.kubernetes.io/name=resource-governance -n "${NAMESPACE}" --timeout=300s
# Obter URL da rota
ROUTE_URL=$(oc get route resource-governance-route -n "${NAMESPACE}" -o jsonpath='{.spec.host}')
if [ -n "${ROUTE_URL}" ]; then
echo -e "${GREEN}🎉 Deploy completed successfully!${NC}"
echo -e "${BLUE}🌐 Application URL: https://${ROUTE_URL}${NC}"
else
echo -e "${YELLOW}⚠️ Deploy completed, but route URL not found.${NC}"
echo -e "${BLUE}Check with: oc get routes -n ${NAMESPACE}${NC}"
fi
# Mostrar status
echo -e "${BLUE}📊 Deployment status:${NC}"
oc get all -n "${NAMESPACE}"
echo -e "${BLUE}🔍 To check logs:${NC}"
echo -e " oc logs -f daemonset/resource-governance -n ${NAMESPACE}"
echo -e "${BLUE}🧪 To test health:${NC}"
echo -e " curl https://${ROUTE_URL}/health"

81
scripts/undeploy.sh Executable file
View File

@@ -0,0 +1,81 @@
#!/bin/bash
# Script de undeploy para OpenShift Resource Governance Tool
set -e
# Cores para output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configurações
NAMESPACE="resource-governance"
echo -e "${BLUE}🗑️ Undeploying OpenShift Resource Governance Tool${NC}"
echo -e "${BLUE}Namespace: ${NAMESPACE}${NC}"
# Verificar se oc está instalado
if ! command -v oc &> /dev/null; then
echo -e "${RED}❌ OpenShift CLI (oc) não está instalado.${NC}"
exit 1
fi
# Verificar se está logado no OpenShift
if ! oc whoami &> /dev/null; then
echo -e "${RED}❌ Não está logado no OpenShift.${NC}"
echo -e "${YELLOW}Faça login com: oc login <cluster-url>${NC}"
exit 1
fi
echo -e "${GREEN}✅ Logado como: $(oc whoami)${NC}"
# Confirmar remoção
read -p "Tem certeza que deseja remover a aplicação? (y/N): " -n 1 -r
echo
if [[ ! $REPLY =~ ^[Yy]$ ]]; then
echo -e "${YELLOW}❌ Operação cancelada.${NC}"
exit 0
fi
# Remover Route
echo -e "${YELLOW}🛣️ Removing Route...${NC}"
oc delete -f k8s/route.yaml --ignore-not-found=true
# Remover Service
echo -e "${YELLOW}🌐 Removing Service...${NC}"
oc delete -f k8s/service.yaml --ignore-not-found=true
# Remover DaemonSet
echo -e "${YELLOW}📦 Removing DaemonSet...${NC}"
oc delete -f k8s/daemonset.yaml --ignore-not-found=true
# Aguardar pods serem removidos
echo -e "${YELLOW}⏳ Waiting for pods to be terminated...${NC}"
oc wait --for=delete pod -l app.kubernetes.io/name=resource-governance -n "${NAMESPACE}" --timeout=60s || true
# Remover ConfigMap
echo -e "${YELLOW}⚙️ Removing ConfigMap...${NC}"
oc delete -f k8s/configmap.yaml --ignore-not-found=true
# Remover RBAC
echo -e "${YELLOW}🔐 Removing RBAC...${NC}"
oc delete -f k8s/rbac.yaml --ignore-not-found=true
# Remover namespace (opcional)
read -p "Deseja remover o namespace também? (y/N): " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
echo -e "${YELLOW}📁 Removing namespace...${NC}"
oc delete -f k8s/namespace.yaml --ignore-not-found=true
echo -e "${GREEN}✅ Namespace removed.${NC}"
else
echo -e "${YELLOW}⚠️ Namespace mantido.${NC}"
fi
echo -e "${GREEN}🎉 Undeploy completed successfully!${NC}"
# Verificar se ainda há recursos
echo -e "${BLUE}🔍 Checking remaining resources:${NC}"
oc get all -n "${NAMESPACE}" 2>/dev/null || echo -e "${GREEN}✅ No resources found in namespace.${NC}"

67
setup.sh Executable file
View File

@@ -0,0 +1,67 @@
#!/bin/bash
# Script de setup para OpenShift Resource Governance Tool
set -e
# Cores para output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
echo -e "${BLUE}🚀 Setting up OpenShift Resource Governance Tool${NC}"
# Verificar se Python está instalado
if ! command -v python3 &> /dev/null; then
echo -e "${RED}❌ Python 3 não está instalado.${NC}"
echo -e "${YELLOW}Instale Python 3.11+ e tente novamente.${NC}"
exit 1
fi
# Verificar se pip está instalado
if ! command -v pip3 &> /dev/null; then
echo -e "${RED}❌ pip3 não está instalado.${NC}"
echo -e "${YELLOW}Instale pip3 e tente novamente.${NC}"
exit 1
fi
# Instalar dependências Python
echo -e "${YELLOW}📦 Installing Python dependencies...${NC}"
pip3 install -r requirements.txt
# Tornar scripts executáveis
echo -e "${YELLOW}🔧 Making scripts executable...${NC}"
chmod +x scripts/*.sh
# Criar diretório de relatórios
echo -e "${YELLOW}📁 Creating reports directory...${NC}"
mkdir -p reports
# Verificar se Docker está instalado
if command -v docker &> /dev/null; then
echo -e "${GREEN}✅ Docker encontrado${NC}"
else
echo -e "${YELLOW}⚠️ Docker não encontrado. Instale para fazer build da imagem.${NC}"
fi
# Verificar se oc está instalado
if command -v oc &> /dev/null; then
echo -e "${GREEN}✅ OpenShift CLI (oc) encontrado${NC}"
else
echo -e "${YELLOW}⚠️ OpenShift CLI (oc) não encontrado. Instale para fazer deploy.${NC}"
fi
echo -e "${GREEN}🎉 Setup completed successfully!${NC}"
echo ""
echo -e "${BLUE}Próximos passos:${NC}"
echo -e "1. ${YELLOW}Desenvolvimento local:${NC} make dev"
echo -e "2. ${YELLOW}Build da imagem:${NC} make build"
echo -e "3. ${YELLOW}Deploy no OpenShift:${NC} make deploy"
echo -e "4. ${YELLOW}Ver documentação:${NC} cat README.md"
echo ""
echo -e "${BLUE}Comandos úteis:${NC}"
echo -e " make help - Mostrar todos os comandos"
echo -e " make test - Executar testes"
echo -e " make logs - Ver logs da aplicação"
echo -e " make status - Ver status da aplicação"