commit 4d60c0e039fe5072a31f06a10f2b7fd39a8ce78b Author: andersonid Date: Thu Sep 25 14:26:24 2025 -0300 Initial commit: OpenShift Resource Governance Tool - Implementa ferramenta completa de governança de recursos - Backend Python com FastAPI para coleta de dados - Validações seguindo best practices Red Hat - Integração com Prometheus e VPA - UI web interativa para visualização - Relatórios em JSON, CSV e PDF - Deploy como DaemonSet com RBAC - Scripts de automação para build e deploy diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f6aa1d6 --- /dev/null +++ b/.env.example @@ -0,0 +1,23 @@ +# Configurações do OpenShift/Kubernetes +KUBECONFIG_PATH= +CLUSTER_URL= +TOKEN= + +# Configurações do Prometheus +PROMETHEUS_URL=http://prometheus.openshift-monitoring.svc.cluster.local:9090 + +# Configurações de validação +CPU_LIMIT_RATIO=3.0 +MEMORY_LIMIT_RATIO=3.0 +MIN_CPU_REQUEST=10m +MIN_MEMORY_REQUEST=32Mi + +# Namespaces críticos para VPA (separados por vírgula) +CRITICAL_NAMESPACES=openshift-monitoring,openshift-ingress,openshift-apiserver,openshift-controller-manager,openshift-sdn + +# Configurações de relatório +REPORT_EXPORT_PATH=/tmp/reports + +# Configurações de segurança +ENABLE_RBAC=true +SERVICE_ACCOUNT_NAME=resource-governance-sa diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3234515 --- /dev/null +++ b/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +.DS_Store? 
+._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Project specific +reports/ +*.json +*.csv +*.pdf +logs/ +temp/ +tmp/ + +# Kubernetes +kubeconfig +*.kubeconfig + +# Docker +.dockerignore diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5f20c86 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,58 @@ +# Multi-stage build para otimizar tamanho da imagem +FROM python:3.11-slim as builder + +# Instalar dependências do sistema necessárias para compilação +RUN apt-get update && apt-get install -y \ + gcc \ + g++ \ + && rm -rf /var/lib/apt/lists/* + +# Criar diretório de trabalho +WORKDIR /app + +# Copiar requirements e instalar dependências Python +COPY requirements.txt . +RUN pip install --no-cache-dir --user -r requirements.txt + +# Stage final - imagem de produção +FROM python:3.11-slim + +# Instalar dependências de runtime +RUN apt-get update && apt-get install -y \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Criar usuário não-root +RUN groupadd -r appuser && useradd -r -g appuser appuser + +# Criar diretórios necessários +RUN mkdir -p /app /tmp/reports && \ + chown -R appuser:appuser /app /tmp/reports + +# Copiar dependências Python do stage anterior +COPY --from=builder /root/.local /home/appuser/.local + +# Definir PATH para incluir dependências locais +ENV PATH=/home/appuser/.local/bin:$PATH + +# Definir diretório de trabalho +WORKDIR /app + +# Copiar código da aplicação +COPY app/ ./app/ + +# Alterar propriedade dos arquivos +RUN chown -R appuser:appuser /app + +# Mudar para usuário não-root +USER appuser + +# Expor porta +EXPOSE 8080 + +# Health check +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8080/health || exit 1 + +# Comando para executar a aplicação +CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"] diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..80af0cf --- /dev/null +++ b/Makefile @@ -0,0 +1,139 @@ +# 
Makefile para OpenShift Resource Governance Tool + +# Configurações +IMAGE_NAME = resource-governance +TAG = latest +REGISTRY = quay.io/openshift +FULL_IMAGE_NAME = $(REGISTRY)/$(IMAGE_NAME):$(TAG) +NAMESPACE = resource-governance + +# Cores para output +RED = \033[0;31m +GREEN = \033[0;32m +YELLOW = \033[1;33m +BLUE = \033[0;34m +NC = \033[0m # No Color + +.PHONY: help build test deploy undeploy clean dev logs status + +help: ## Mostrar ajuda + @echo "$(BLUE)OpenShift Resource Governance Tool$(NC)" + @echo "" + @echo "Comandos disponíveis:" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf " $(GREEN)%-15s$(NC) %s\n", $$1, $$2}' + +build: ## Build da imagem Docker + @echo "$(YELLOW)📦 Building Docker image...$(NC)" + @./scripts/build.sh $(TAG) $(REGISTRY) + +test: ## Testar a aplicação + @echo "$(YELLOW)🧪 Testing application...$(NC)" + @python -c "import app.main; print('$(GREEN)✅ App imports successfully$(NC)')" + @echo "$(YELLOW)🧪 Testing API...$(NC)" + @python -m uvicorn app.main:app --host 0.0.0.0 --port 8080 & + @sleep 5 + @curl -f http://localhost:8080/health || (echo "$(RED)❌ Health check failed$(NC)" && exit 1) + @pkill -f uvicorn + @echo "$(GREEN)✅ Tests passed$(NC)" + +deploy: ## Deploy no OpenShift + @echo "$(YELLOW)🚀 Deploying to OpenShift...$(NC)" + @./scripts/deploy.sh $(TAG) $(REGISTRY) + +undeploy: ## Remover do OpenShift + @echo "$(YELLOW)🗑️ Undeploying from OpenShift...$(NC)" + @./scripts/undeploy.sh + +clean: ## Limpar recursos locais + @echo "$(YELLOW)🧹 Cleaning up...$(NC)" + @docker rmi $(FULL_IMAGE_NAME) 2>/dev/null || true + @docker system prune -f + @echo "$(GREEN)✅ Cleanup completed$(NC)" + +dev: ## Executar em modo desenvolvimento + @echo "$(YELLOW)🔧 Starting development server...$(NC)" + @python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8080 + +logs: ## Ver logs da aplicação + @echo "$(YELLOW)📋 Showing application logs...$(NC)" + @oc logs -f daemonset/$(IMAGE_NAME) -n 
$(NAMESPACE) + +status: ## Ver status da aplicação + @echo "$(YELLOW)📊 Application status:$(NC)" + @oc get all -n $(NAMESPACE) + @echo "" + @echo "$(YELLOW)🌐 Route URL:$(NC)" + @oc get route $(IMAGE_NAME)-route -n $(NAMESPACE) -o jsonpath='{.spec.host}' 2>/dev/null || echo "Route not found" + +install-deps: ## Instalar dependências Python + @echo "$(YELLOW)📦 Installing Python dependencies...$(NC)" + @pip install -r requirements.txt + @echo "$(GREEN)✅ Dependencies installed$(NC)" + +format: ## Formatar código Python + @echo "$(YELLOW)🎨 Formatting Python code...$(NC)" + @python -m black app/ + @python -m isort app/ + @echo "$(GREEN)✅ Code formatted$(NC)" + +lint: ## Verificar código Python + @echo "$(YELLOW)🔍 Linting Python code...$(NC)" + @python -m flake8 app/ + @python -m mypy app/ + @echo "$(GREEN)✅ Linting completed$(NC)" + +security: ## Verificar segurança + @echo "$(YELLOW)🔒 Security check...$(NC)" + @python -m bandit -r app/ + @echo "$(GREEN)✅ Security check completed$(NC)" + +all: clean install-deps format lint test build ## Executar pipeline completo + +# Comandos específicos do OpenShift +oc-login: ## Fazer login no OpenShift + @echo "$(YELLOW)🔐 Logging into OpenShift...$(NC)" + @oc login + +oc-projects: ## Listar projetos OpenShift + @echo "$(YELLOW)📋 OpenShift projects:$(NC)" + @oc get projects + +oc-ns: ## Criar namespace + @echo "$(YELLOW)📁 Creating namespace...$(NC)" + @oc apply -f k8s/namespace.yaml + +oc-rbac: ## Aplicar RBAC + @echo "$(YELLOW)🔐 Applying RBAC...$(NC)" + @oc apply -f k8s/rbac.yaml + +oc-config: ## Aplicar ConfigMap + @echo "$(YELLOW)⚙️ Applying ConfigMap...$(NC)" + @oc apply -f k8s/configmap.yaml + +oc-deploy: ## Aplicar DaemonSet + @echo "$(YELLOW)📦 Applying DaemonSet...$(NC)" + @oc apply -f k8s/daemonset.yaml + +oc-service: ## Aplicar Service + @echo "$(YELLOW)🌐 Applying Service...$(NC)" + @oc apply -f k8s/service.yaml + +oc-route: ## Aplicar Route + @echo "$(YELLOW)🛣️ Applying Route...$(NC)" + @oc apply -f k8s/route.yaml + 
+oc-apply: oc-ns oc-rbac oc-config oc-deploy oc-service oc-route ## Aplicar todos os recursos + +# Comandos de monitoramento +monitor: ## Monitorar aplicação + @echo "$(YELLOW)📊 Monitoring application...$(NC)" + @watch -n 5 'oc get pods -n $(NAMESPACE) && echo "" && oc get route $(IMAGE_NAME)-route -n $(NAMESPACE)' + +health: ## Verificar saúde da aplicação + @echo "$(YELLOW)🏥 Health check...$(NC)" + @ROUTE_URL=$$(oc get route $(IMAGE_NAME)-route -n $(NAMESPACE) -o jsonpath='{.spec.host}' 2>/dev/null); \ + if [ -n "$$ROUTE_URL" ]; then \ + curl -f https://$$ROUTE_URL/health || echo "$(RED)❌ Health check failed$(NC)"; \ + else \ + echo "$(RED)❌ Route not found$(NC)"; \ + fi diff --git a/README.md b/README.md new file mode 100644 index 0000000..cc08327 --- /dev/null +++ b/README.md @@ -0,0 +1,301 @@ +# OpenShift Resource Governance Tool + +Uma ferramenta de governança de recursos para clusters OpenShift que vai além do que o Metrics Server e VPA oferecem, fornecendo validações, relatórios e recomendações consolidadas. + +## 🚀 Características + +- **Coleta Automática**: Coleta requests/limits de todos os pods/containers no cluster +- **Validações Red Hat**: Valida best practices de capacity management +- **Integração VPA**: Consome recomendações do VPA em modo Off +- **Integração Prometheus**: Coleta métricas reais de consumo +- **Relatórios Consolidados**: Gera relatórios em JSON, CSV e PDF +- **UI Web**: Interface simples para visualização e interação +- **Aplicação de Recomendações**: Permite aprovar e aplicar recomendações + +## 📋 Requisitos + +- OpenShift 4.x +- Prometheus (nativo no OCP) +- VPA (opcional, para recomendações) +- Python 3.11+ +- Docker +- OpenShift CLI (oc) + +## 🛠️ Instalação + +### 1. Build da Imagem + +```bash +# Build local +./scripts/build.sh + +# Build com tag específica +./scripts/build.sh v1.0.0 + +# Build para registry específico +./scripts/build.sh latest quay.io/seu-usuario +``` + +### 2. 
Deploy no OpenShift + +```bash +# Deploy padrão +./scripts/deploy.sh + +# Deploy com tag específica +./scripts/deploy.sh v1.0.0 + +# Deploy para registry específico +./scripts/deploy.sh latest quay.io/seu-usuario +``` + +### 3. Acesso à Aplicação + +Após o deploy, acesse a aplicação através da rota criada: + +```bash +# Obter URL da rota +oc get route resource-governance-route -n resource-governance + +# Acessar via browser +# https://resource-governance-route-resource-governance.apps.openshift.local +``` + +## 🔧 Configuração + +### ConfigMap + +A aplicação é configurada através do ConfigMap `resource-governance-config`: + +```yaml +data: + CPU_LIMIT_RATIO: "3.0" # Ratio padrão limit:request para CPU + MEMORY_LIMIT_RATIO: "3.0" # Ratio padrão limit:request para memória + MIN_CPU_REQUEST: "10m" # Mínimo de CPU request + MIN_MEMORY_REQUEST: "32Mi" # Mínimo de memória request + CRITICAL_NAMESPACES: | # Namespaces críticos para VPA + openshift-monitoring + openshift-ingress + openshift-apiserver + PROMETHEUS_URL: "http://prometheus.openshift-monitoring.svc.cluster.local:9090" +``` + +### Variáveis de Ambiente + +- `KUBECONFIG`: Caminho para kubeconfig (usado em desenvolvimento) +- `PROMETHEUS_URL`: URL do Prometheus +- `CPU_LIMIT_RATIO`: Ratio CPU limit:request +- `MEMORY_LIMIT_RATIO`: Ratio memória limit:request +- `MIN_CPU_REQUEST`: Mínimo de CPU request +- `MIN_MEMORY_REQUEST`: Mínimo de memória request + +## 📊 Uso + +### API Endpoints + +#### Status do Cluster +```bash +GET /api/v1/cluster/status +``` + +#### Status de Namespace +```bash +GET /api/v1/namespace/{namespace}/status +``` + +#### Validações +```bash +GET /api/v1/validations?namespace=default&severity=error +``` + +#### Recomendações VPA +```bash +GET /api/v1/vpa/recommendations?namespace=default +``` + +#### Exportar Relatório +```bash +POST /api/v1/export +Content-Type: application/json + +{ + "format": "json", + "namespaces": ["default", "kube-system"], + "includeVPA": true, + "includeValidations": 
true +} +``` + +### Exemplos de Uso + +#### 1. Verificar Status do Cluster +```bash +curl https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/cluster/status +``` + +#### 2. Exportar Relatório CSV +```bash +curl -X POST https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/export \ + -H "Content-Type: application/json" \ + -d '{"format": "csv", "includeVPA": true}' +``` + +#### 3. Ver Validações Críticas +```bash +curl "https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/validations?severity=critical" +``` + +## 🔍 Validações Implementadas + +### 1. Requests Obrigatórios +- **Problema**: Pods sem requests definidos +- **Severidade**: Error +- **Recomendação**: Definir requests de CPU e memória + +### 2. Limits Recomendados +- **Problema**: Pods sem limits definidos +- **Severidade**: Warning +- **Recomendação**: Definir limits para evitar consumo excessivo + +### 3. Ratio Limit:Request +- **Problema**: Ratio muito alto ou baixo +- **Severidade**: Warning/Error +- **Recomendação**: Ajustar para ratio 3:1 + +### 4. Valores Mínimos +- **Problema**: Requests muito baixos +- **Severidade**: Warning +- **Recomendação**: Aumentar para valores mínimos + +### 5. 
Overcommit +- **Problema**: Requests excedem capacidade do cluster +- **Severidade**: Critical +- **Recomendação**: Reduzir requests ou adicionar nós + +## 📈 Relatórios + +### Formato JSON +```json +{ + "timestamp": "2024-01-15T10:30:00Z", + "total_pods": 150, + "total_namespaces": 25, + "total_nodes": 3, + "validations": [...], + "vpa_recommendations": [...], + "summary": { + "total_validations": 45, + "critical_issues": 5, + "warnings": 25, + "errors": 15 + } +} +``` + +### Formato CSV +```csv +Pod Name,Namespace,Container Name,Validation Type,Severity,Message,Recommendation +pod-1,default,nginx,missing_requests,error,Container sem requests definidos,Definir requests de CPU e memória +``` + +## 🔐 Segurança + +### RBAC +A aplicação usa um ServiceAccount dedicado com permissões mínimas: + +- **Pods**: get, list, watch, patch, update +- **Namespaces**: get, list, watch +- **Nodes**: get, list, watch +- **VPA**: get, list, watch +- **Deployments/ReplicaSets**: get, list, watch, patch, update + +### Security Context +- Executa como usuário não-root (UID 1000) +- Usa SecurityContext com runAsNonRoot: true +- Limita recursos com requests/limits + +## 🐛 Troubleshooting + +### Verificar Logs +```bash +oc logs -f daemonset/resource-governance -n resource-governance +``` + +### Verificar Status dos Pods +```bash +oc get pods -n resource-governance +oc describe pod -n resource-governance +``` + +### Verificar RBAC +```bash +oc auth can-i get pods --as=system:serviceaccount:resource-governance:resource-governance-sa +``` + +### Testar Conectividade +```bash +# Health check +curl https://resource-governance-route-resource-governance.apps.openshift.local/health + +# Teste de API +curl https://resource-governance-route-resource-governance.apps.openshift.local/api/v1/cluster/status +``` + +## 🚀 Desenvolvimento + +### Executar Localmente +```bash +# Instalar dependências +pip install -r requirements.txt + +# Executar aplicação +python -m uvicorn app.main:app --reload --host 
0.0.0.0 --port 8080 +``` + +### Executar com Docker +```bash +# Build +docker build -t resource-governance . + +# Executar +docker run -p 8080:8080 resource-governance +``` + +### Testes +```bash +# Testar importação +python -c "import app.main; print('OK')" + +# Testar API +curl http://localhost:8080/health +``` + +## 📝 Roadmap + +### Próximas Versões +- [ ] UI Web com gráficos interativos +- [ ] Relatórios PDF com gráficos +- [ ] Regras customizadas por namespace +- [ ] Integração com GitOps (ArgoCD) +- [ ] Notificações via Slack/Teams +- [ ] Métricas customizadas do Prometheus +- [ ] Suporte a múltiplos clusters + +## 🤝 Contribuição + +1. Fork o projeto +2. Crie uma branch para sua feature (`git checkout -b feature/AmazingFeature`) +3. Commit suas mudanças (`git commit -m 'Add some AmazingFeature'`) +4. Push para a branch (`git push origin feature/AmazingFeature`) +5. Abra um Pull Request + +## 📄 Licença + +Este projeto está sob a licença MIT. Veja o arquivo [LICENSE](LICENSE) para detalhes. 
+ +## 📞 Suporte + +Para suporte e dúvidas: +- Abra uma issue no GitHub +- Consulte a documentação do OpenShift +- Verifique os logs da aplicação diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..779f0f5 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1 @@ +# OpenShift Resource Governance Tool diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..2a30ed8 --- /dev/null +++ b/app/api/__init__.py @@ -0,0 +1 @@ +# API routes diff --git a/app/api/routes.py b/app/api/routes.py new file mode 100644 index 0000000..5fdafe6 --- /dev/null +++ b/app/api/routes.py @@ -0,0 +1,292 @@ +""" +Rotas da API +""" +import logging +from typing import List, Optional +from fastapi import APIRouter, HTTPException, Depends, Request +from fastapi.responses import FileResponse + +from app.models.resource_models import ( + ClusterReport, NamespaceReport, ExportRequest, + ApplyRecommendationRequest +) +from app.services.validation_service import ValidationService +from app.services.report_service import ReportService + +logger = logging.getLogger(__name__) + +# Criar router +api_router = APIRouter() + +# Inicializar serviços +validation_service = ValidationService() +report_service = ReportService() + +def get_k8s_client(request: Request): + """Dependency para obter cliente Kubernetes""" + return request.app.state.k8s_client + +def get_prometheus_client(request: Request): + """Dependency para obter cliente Prometheus""" + return request.app.state.prometheus_client + +@api_router.get("/cluster/status") +async def get_cluster_status( + k8s_client=Depends(get_k8s_client), + prometheus_client=Depends(get_prometheus_client) +): + """Obter status geral do cluster""" + try: + # Coletar dados básicos + pods = await k8s_client.get_all_pods() + nodes_info = await k8s_client.get_nodes_info() + + # Validar recursos + all_validations = [] + for pod in pods: + pod_validations = validation_service.validate_pod_resources(pod) + 
all_validations.extend(pod_validations) + + # Obter informações de overcommit + overcommit_info = await prometheus_client.get_cluster_overcommit() + + # Obter recomendações VPA + vpa_recommendations = await k8s_client.get_vpa_recommendations() + + # Gerar relatório + report = report_service.generate_cluster_report( + pods=pods, + validations=all_validations, + vpa_recommendations=vpa_recommendations, + overcommit_info=overcommit_info, + nodes_info=nodes_info + ) + + return report + + except Exception as e: + logger.error(f"Erro ao obter status do cluster: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/namespace/{namespace}/status") +async def get_namespace_status( + namespace: str, + k8s_client=Depends(get_k8s_client), + prometheus_client=Depends(get_prometheus_client) +): + """Obter status de um namespace específico""" + try: + # Coletar dados do namespace + namespace_resources = await k8s_client.get_namespace_resources(namespace) + + # Validar recursos + all_validations = [] + for pod in namespace_resources.pods: + pod_validations = validation_service.validate_pod_resources(pod) + all_validations.extend(pod_validations) + + # Obter uso de recursos do Prometheus + resource_usage = await prometheus_client.get_namespace_resource_usage(namespace) + + # Gerar relatório do namespace + report = report_service.generate_namespace_report( + namespace=namespace, + pods=namespace_resources.pods, + validations=all_validations, + resource_usage=resource_usage + ) + + return report + + except Exception as e: + logger.error(f"Erro ao obter status do namespace {namespace}: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/pods") +async def get_pods( + namespace: Optional[str] = None, + k8s_client=Depends(get_k8s_client) +): + """Listar pods com informações de recursos""" + try: + if namespace: + namespace_resources = await k8s_client.get_namespace_resources(namespace) + return namespace_resources.pods + else: + 
return await k8s_client.get_all_pods() + + except Exception as e: + logger.error(f"Erro ao listar pods: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/validations") +async def get_validations( + namespace: Optional[str] = None, + severity: Optional[str] = None, + k8s_client=Depends(get_k8s_client) +): + """Listar validações de recursos""" + try: + # Coletar pods + if namespace: + namespace_resources = await k8s_client.get_namespace_resources(namespace) + pods = namespace_resources.pods + else: + pods = await k8s_client.get_all_pods() + + # Validar recursos + all_validations = [] + for pod in pods: + pod_validations = validation_service.validate_pod_resources(pod) + all_validations.extend(pod_validations) + + # Filtrar por severidade se especificado + if severity: + all_validations = [ + v for v in all_validations if v.severity == severity + ] + + return all_validations + + except Exception as e: + logger.error(f"Erro ao obter validações: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/vpa/recommendations") +async def get_vpa_recommendations( + namespace: Optional[str] = None, + k8s_client=Depends(get_k8s_client) +): + """Obter recomendações do VPA""" + try: + recommendations = await k8s_client.get_vpa_recommendations() + + if namespace: + recommendations = [ + r for r in recommendations if r.namespace == namespace + ] + + return recommendations + + except Exception as e: + logger.error(f"Erro ao obter recomendações VPA: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.post("/export") +async def export_report( + export_request: ExportRequest, + k8s_client=Depends(get_k8s_client), + prometheus_client=Depends(get_prometheus_client) +): + """Exportar relatório em diferentes formatos""" + try: + # Gerar relatório + pods = await k8s_client.get_all_pods() + nodes_info = await k8s_client.get_nodes_info() + + # Filtrar por namespaces se especificado + if export_request.namespaces: + 
pods = [p for p in pods if p.namespace in export_request.namespaces] + + # Validar recursos + all_validations = [] + for pod in pods: + pod_validations = validation_service.validate_pod_resources(pod) + all_validations.extend(pod_validations) + + # Obter informações adicionais + overcommit_info = {} + vpa_recommendations = [] + + if export_request.include_vpa: + vpa_recommendations = await k8s_client.get_vpa_recommendations() + + if export_request.include_validations: + overcommit_info = await prometheus_client.get_cluster_overcommit() + + # Gerar relatório + report = report_service.generate_cluster_report( + pods=pods, + validations=all_validations, + vpa_recommendations=vpa_recommendations, + overcommit_info=overcommit_info, + nodes_info=nodes_info + ) + + # Exportar + filepath = await report_service.export_report(report, export_request) + + return { + "message": "Relatório exportado com sucesso", + "filepath": filepath, + "format": export_request.format + } + + except Exception as e: + logger.error(f"Erro ao exportar relatório: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/export/files") +async def list_exported_files(): + """Listar arquivos exportados""" + try: + files = report_service.get_exported_reports() + return files + + except Exception as e: + logger.error(f"Erro ao listar arquivos exportados: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/export/files/{filename}") +async def download_exported_file(filename: str): + """Download de arquivo exportado""" + try: + files = report_service.get_exported_reports() + file_info = next((f for f in files if f["filename"] == filename), None) + + if not file_info: + raise HTTPException(status_code=404, detail="Arquivo não encontrado") + + return FileResponse( + path=file_info["filepath"], + filename=filename, + media_type='application/octet-stream' + ) + + except Exception as e: + logger.error(f"Erro ao baixar arquivo {filename}: {e}") + raise 
HTTPException(status_code=500, detail=str(e)) + +@api_router.post("/apply/recommendation") +async def apply_recommendation( + recommendation: ApplyRecommendationRequest, + k8s_client=Depends(get_k8s_client) +): + """Aplicar recomendação de recursos""" + try: + # TODO: Implementar aplicação de recomendações + # Por enquanto, apenas simular + if recommendation.dry_run: + return { + "message": "Dry run - recomendação seria aplicada", + "pod": recommendation.pod_name, + "namespace": recommendation.namespace, + "container": recommendation.container_name, + "action": f"{recommendation.action} {recommendation.resource_type} = {recommendation.value}" + } + else: + # Implementar aplicação real da recomendação + raise HTTPException(status_code=501, detail="Aplicação de recomendações não implementada ainda") + + except Exception as e: + logger.error(f"Erro ao aplicar recomendação: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@api_router.get("/health") +async def health_check(): + """Health check da API""" + return { + "status": "healthy", + "service": "resource-governance-api", + "version": "1.0.0" + } diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..5a60488 --- /dev/null +++ b/app/core/__init__.py @@ -0,0 +1 @@ +# Core modules diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..2516e43 --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,45 @@ +""" +Configurações da aplicação +""" +import os +from typing import List, Optional +from pydantic import BaseSettings + +class Settings(BaseSettings): + """Configurações da aplicação""" + + # Configurações do OpenShift/Kubernetes + kubeconfig_path: Optional[str] = None + cluster_url: Optional[str] = None + token: Optional[str] = None + + # Configurações do Prometheus + prometheus_url: str = "http://prometheus.openshift-monitoring.svc.cluster.local:9090" + + # Configurações de validação + cpu_limit_ratio: float = 3.0 # Ratio padrão 
limit:request para CPU + memory_limit_ratio: float = 3.0 # Ratio padrão limit:request para memória + min_cpu_request: str = "10m" # Mínimo de CPU request + min_memory_request: str = "32Mi" # Mínimo de memória request + + # Namespaces críticos para VPA + critical_namespaces: List[str] = [ + "openshift-monitoring", + "openshift-ingress", + "openshift-apiserver", + "openshift-controller-manager", + "openshift-sdn" + ] + + # Configurações de relatório + report_export_path: str = "/tmp/reports" + + # Configurações de segurança + enable_rbac: bool = True + service_account_name: str = "resource-governance-sa" + + class Config: + env_file = ".env" + case_sensitive = False + +settings = Settings() diff --git a/app/core/kubernetes_client.py b/app/core/kubernetes_client.py new file mode 100644 index 0000000..5160264 --- /dev/null +++ b/app/core/kubernetes_client.py @@ -0,0 +1,234 @@ +""" +Cliente Kubernetes/OpenShift para coleta de dados +""" +import logging +from typing import List, Dict, Any, Optional +from kubernetes import client, config +from kubernetes.client.rest import ApiException +import asyncio +import aiohttp + +from app.core.config import settings +from app.models.resource_models import PodResource, NamespaceResources, VPARecommendation + +logger = logging.getLogger(__name__) + +class K8sClient: + """Cliente para interação com Kubernetes/OpenShift""" + + def __init__(self): + self.v1 = None + self.autoscaling_v1 = None + self.apps_v1 = None + self.initialized = False + + async def initialize(self): + """Inicializar cliente Kubernetes""" + try: + # Tentar carregar configuração do cluster + if settings.kubeconfig_path: + config.load_kube_config(config_file=settings.kubeconfig_path) + else: + # Usar configuração in-cluster + config.load_incluster_config() + + # Inicializar clientes da API + self.v1 = client.CoreV1Api() + self.autoscaling_v1 = client.AutoscalingV1Api() + self.apps_v1 = client.AppsV1Api() + + self.initialized = True + logger.info("Cliente Kubernetes 
inicializado com sucesso") + + except Exception as e: + logger.error(f"Erro ao inicializar cliente Kubernetes: {e}") + raise + + async def get_all_pods(self) -> List[PodResource]: + """Coletar informações de todos os pods do cluster""" + if not self.initialized: + raise RuntimeError("Cliente Kubernetes não inicializado") + + pods_data = [] + + try: + # Listar todos os pods em todos os namespaces + pods = self.v1.list_pod_for_all_namespaces(watch=False) + + for pod in pods.items: + pod_resource = PodResource( + name=pod.metadata.name, + namespace=pod.metadata.namespace, + node_name=pod.spec.node_name, + phase=pod.status.phase, + containers=[] + ) + + # Processar containers do pod + for container in pod.spec.containers: + container_resource = { + "name": container.name, + "image": container.image, + "resources": { + "requests": {}, + "limits": {} + } + } + + # Extrair requests e limits + if container.resources: + if container.resources.requests: + container_resource["resources"]["requests"] = { + k: v for k, v in container.resources.requests.items() + } + if container.resources.limits: + container_resource["resources"]["limits"] = { + k: v for k, v in container.resources.limits.items() + } + + pod_resource.containers.append(container_resource) + + pods_data.append(pod_resource) + + logger.info(f"Coletados {len(pods_data)} pods") + return pods_data + + except ApiException as e: + logger.error(f"Erro ao listar pods: {e}") + raise + + async def get_namespace_resources(self, namespace: str) -> NamespaceResources: + """Coletar recursos de um namespace específico""" + if not self.initialized: + raise RuntimeError("Cliente Kubernetes não inicializado") + + try: + # Listar pods do namespace + pods = self.v1.list_namespaced_pod(namespace=namespace) + + namespace_resource = NamespaceResources( + name=namespace, + pods=[], + total_cpu_requests="0", + total_cpu_limits="0", + total_memory_requests="0", + total_memory_limits="0" + ) + + for pod in pods.items: + pod_resource = 
PodResource( + name=pod.metadata.name, + namespace=pod.metadata.namespace, + node_name=pod.spec.node_name, + phase=pod.status.phase, + containers=[] + ) + + for container in pod.spec.containers: + container_resource = { + "name": container.name, + "image": container.image, + "resources": { + "requests": {}, + "limits": {} + } + } + + if container.resources: + if container.resources.requests: + container_resource["resources"]["requests"] = { + k: v for k, v in container.resources.requests.items() + } + if container.resources.limits: + container_resource["resources"]["limits"] = { + k: v for k, v in container.resources.limits.items() + } + + pod_resource.containers.append(container_resource) + + namespace_resource.pods.append(pod_resource) + + return namespace_resource + + except ApiException as e: + logger.error(f"Erro ao coletar recursos do namespace {namespace}: {e}") + raise + + async def get_vpa_recommendations(self) -> List[VPARecommendation]: + """Coletar recomendações do VPA""" + if not self.initialized: + raise RuntimeError("Cliente Kubernetes não inicializado") + + recommendations = [] + + try: + # Listar VPA objects em todos os namespaces + vpa_list = self.autoscaling_v1.list_vertical_pod_autoscaler_for_all_namespaces() + + for vpa in vpa_list.items: + if vpa.status and vpa.status.recommendation: + recommendation = VPARecommendation( + name=vpa.metadata.name, + namespace=vpa.metadata.namespace, + target_ref=vpa.spec.target_ref, + recommendations=vpa.status.recommendation + ) + recommendations.append(recommendation) + + logger.info(f"Coletadas {len(recommendations)} recomendações VPA") + return recommendations + + except ApiException as e: + logger.error(f"Erro ao coletar recomendações VPA: {e}") + # VPA pode não estar instalado, retornar lista vazia + return [] + + async def get_nodes_info(self) -> List[Dict[str, Any]]: + """Coletar informações dos nós do cluster""" + if not self.initialized: + raise RuntimeError("Cliente Kubernetes não inicializado") + + 
try: + nodes = self.v1.list_node() + nodes_info = [] + + for node in nodes.items: + node_info = { + "name": node.metadata.name, + "labels": node.metadata.labels or {}, + "capacity": {}, + "allocatable": {}, + "conditions": [] + } + + # Capacidade do nó + if node.status.capacity: + node_info["capacity"] = { + k: v for k, v in node.status.capacity.items() + } + + # Recursos alocáveis + if node.status.allocatable: + node_info["allocatable"] = { + k: v for k, v in node.status.allocatable.items() + } + + # Condições do nó + if node.status.conditions: + node_info["conditions"] = [ + { + "type": condition.type, + "status": condition.status, + "reason": condition.reason, + "message": condition.message + } + for condition in node.status.conditions + ] + + nodes_info.append(node_info) + + return nodes_info + + except ApiException as e: + logger.error(f"Erro ao coletar informações dos nós: {e}") + raise diff --git a/app/core/prometheus_client.py b/app/core/prometheus_client.py new file mode 100644 index 0000000..73213b8 --- /dev/null +++ b/app/core/prometheus_client.py @@ -0,0 +1,131 @@ +""" +Cliente Prometheus para coleta de métricas +""" +import logging +import aiohttp +import asyncio +from typing import Dict, List, Any, Optional +from datetime import datetime, timedelta + +from app.core.config import settings + +logger = logging.getLogger(__name__) + +class PrometheusClient: + """Cliente para interação com Prometheus""" + + def __init__(self): + self.base_url = settings.prometheus_url + self.session = None + self.initialized = False + + async def initialize(self): + """Inicializar cliente Prometheus""" + try: + self.session = aiohttp.ClientSession() + + # Testar conexão + async with self.session.get(f"{self.base_url}/api/v1/query?query=up") as response: + if response.status == 200: + self.initialized = True + logger.info("Cliente Prometheus inicializado com sucesso") + else: + logger.warning(f"Prometheus retornou status {response.status}") + + except Exception as e: + 
logger.error(f"Erro ao inicializar cliente Prometheus: {e}") + # Prometheus pode não estar disponível, continuar sem ele + self.initialized = False + + async def query(self, query: str, time: Optional[datetime] = None) -> Dict[str, Any]: + """Executar query no Prometheus""" + if not self.initialized or not self.session: + return {"status": "error", "message": "Prometheus não disponível"} + + try: + params = {"query": query} + if time: + params["time"] = int(time.timestamp()) + + async with self.session.get( + f"{self.base_url}/api/v1/query", + params=params + ) as response: + if response.status == 200: + data = await response.json() + return data + else: + logger.error(f"Erro na query Prometheus: {response.status}") + return {"status": "error", "message": f"HTTP {response.status}"} + + except Exception as e: + logger.error(f"Erro ao executar query Prometheus: {e}") + return {"status": "error", "message": str(e)} + + async def get_pod_cpu_usage(self, namespace: str, pod_name: str) -> Dict[str, Any]: + """Obter uso de CPU de um pod específico""" + query = f'rate(container_cpu_usage_seconds_total{{namespace="{namespace}", pod="{pod_name}"}}[5m])' + return await self.query(query) + + async def get_pod_memory_usage(self, namespace: str, pod_name: str) -> Dict[str, Any]: + """Obter uso de memória de um pod específico""" + query = f'container_memory_working_set_bytes{{namespace="{namespace}", pod="{pod_name}"}}' + return await self.query(query) + + async def get_namespace_resource_usage(self, namespace: str) -> Dict[str, Any]: + """Obter uso de recursos de um namespace""" + cpu_query = f'sum(rate(container_cpu_usage_seconds_total{{namespace="{namespace}"}}[5m]))' + memory_query = f'sum(container_memory_working_set_bytes{{namespace="{namespace}"}})' + + cpu_result = await self.query(cpu_query) + memory_result = await self.query(memory_query) + + return { + "cpu": cpu_result, + "memory": memory_result + } + + async def get_cluster_overcommit(self) -> Dict[str, Any]: + 
"""Verificar overcommit no cluster""" + # CPU overcommit + cpu_capacity_query = 'sum(kube_node_status_capacity{resource="cpu"})' + cpu_requests_query = 'sum(kube_pod_container_resource_requests{resource="cpu"})' + + # Memory overcommit + memory_capacity_query = 'sum(kube_node_status_capacity{resource="memory"})' + memory_requests_query = 'sum(kube_pod_container_resource_requests{resource="memory"})' + + cpu_capacity = await self.query(cpu_capacity_query) + cpu_requests = await self.query(cpu_requests_query) + memory_capacity = await self.query(memory_capacity_query) + memory_requests = await self.query(memory_requests_query) + + return { + "cpu": { + "capacity": cpu_capacity, + "requests": cpu_requests + }, + "memory": { + "capacity": memory_capacity, + "requests": memory_requests + } + } + + async def get_node_resource_usage(self) -> List[Dict[str, Any]]: + """Obter uso de recursos por nó""" + query = ''' + ( + kube_node_status_capacity{resource="cpu"} or + kube_node_status_capacity{resource="memory"} or + kube_pod_container_resource_requests{resource="cpu"} or + kube_pod_container_resource_requests{resource="memory"} + ) + ''' + + result = await self.query(query) + return result + + async def close(self): + """Fechar sessão HTTP""" + if self.session: + await self.session.close() diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..a8937a6 --- /dev/null +++ b/app/main.py @@ -0,0 +1,81 @@ +""" +OpenShift Resource Governance Tool +Aplicação para governança de recursos no cluster OpenShift +""" +import os +import logging +from fastapi import FastAPI, HTTPException, Depends +from fastapi.staticfiles import StaticFiles +from fastapi.responses import HTMLResponse +from contextlib import asynccontextmanager + +from app.core.config import settings +from app.api.routes import api_router +from app.core.kubernetes_client import K8sClient +from app.core.prometheus_client import PrometheusClient + +# Configuração de logging +logging.basicConfig( + 
level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +@asynccontextmanager +async def lifespan(app: FastAPI): + """Inicialização e cleanup da aplicação""" + logger.info("Iniciando OpenShift Resource Governance Tool") + + # Inicializar clientes + app.state.k8s_client = K8sClient() + app.state.prometheus_client = PrometheusClient() + + try: + await app.state.k8s_client.initialize() + await app.state.prometheus_client.initialize() + logger.info("Clientes inicializados com sucesso") + except Exception as e: + logger.error(f"Erro ao inicializar clientes: {e}") + raise + + yield + + logger.info("Finalizando aplicação") + +# Criar aplicação FastAPI +app = FastAPI( + title="OpenShift Resource Governance Tool", + description="Ferramenta de governança de recursos para clusters OpenShift", + version="1.0.0", + lifespan=lifespan +) + +# Incluir rotas da API +app.include_router(api_router, prefix="/api/v1") + +# Servir arquivos estáticos +app.mount("/static", StaticFiles(directory="app/static"), name="static") + +@app.get("/", response_class=HTMLResponse) +async def root(): + """Página principal da aplicação""" + with open("app/static/index.html", "r") as f: + return HTMLResponse(content=f.read()) + +@app.get("/health") +async def health_check(): + """Health check endpoint""" + return { + "status": "healthy", + "service": "openshift-resource-governance", + "version": "1.0.0" + } + +if __name__ == "__main__": + import uvicorn + uvicorn.run( + "app.main:app", + host="0.0.0.0", + port=8080, + reload=True + ) diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..9136172 --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1 @@ +# Models diff --git a/app/models/resource_models.py b/app/models/resource_models.py new file mode 100644 index 0000000..ec995cc --- /dev/null +++ b/app/models/resource_models.py @@ -0,0 +1,82 @@ +""" +Modelos de dados para recursos Kubernetes 
+""" +from typing import List, Dict, Any, Optional +from pydantic import BaseModel + +class ContainerResource(BaseModel): + """Recursos de um container""" + name: str + image: str + resources: Dict[str, Dict[str, str]] + +class PodResource(BaseModel): + """Recursos de um pod""" + name: str + namespace: str + node_name: Optional[str] = None + phase: str + containers: List[ContainerResource] + +class NamespaceResources(BaseModel): + """Recursos de um namespace""" + name: str + pods: List[PodResource] + total_cpu_requests: str = "0" + total_cpu_limits: str = "0" + total_memory_requests: str = "0" + total_memory_limits: str = "0" + +class VPARecommendation(BaseModel): + """Recomendação do VPA""" + name: str + namespace: str + target_ref: Dict[str, str] + recommendations: Dict[str, Any] + +class ResourceValidation(BaseModel): + """Resultado de validação de recursos""" + pod_name: str + namespace: str + container_name: str + validation_type: str # "missing_requests", "missing_limits", "invalid_ratio", "overcommit" + severity: str # "warning", "error", "critical" + message: str + recommendation: Optional[str] = None + +class ClusterReport(BaseModel): + """Relatório do cluster""" + timestamp: str + total_pods: int + total_namespaces: int + total_nodes: int + validations: List[ResourceValidation] + vpa_recommendations: List[VPARecommendation] + overcommit_info: Dict[str, Any] + summary: Dict[str, Any] + +class NamespaceReport(BaseModel): + """Relatório de um namespace""" + namespace: str + timestamp: str + total_pods: int + validations: List[ResourceValidation] + resource_usage: Dict[str, Any] + recommendations: List[str] + +class ExportRequest(BaseModel): + """Request para exportar relatório""" + format: str # "json", "csv", "pdf" + namespaces: Optional[List[str]] = None + include_vpa: bool = True + include_validations: bool = True + +class ApplyRecommendationRequest(BaseModel): + """Request para aplicar recomendação""" + pod_name: str + namespace: str + container_name: 
str + resource_type: str # "cpu", "memory" + action: str # "requests", "limits" + value: str + dry_run: bool = True diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..8e5b66b --- /dev/null +++ b/app/services/__init__.py @@ -0,0 +1 @@ +# Services diff --git a/app/services/report_service.py b/app/services/report_service.py new file mode 100644 index 0000000..4c9cd57 --- /dev/null +++ b/app/services/report_service.py @@ -0,0 +1,306 @@ +""" +Serviço de geração de relatórios +""" +import logging +import json +import csv +import os +from datetime import datetime +from typing import List, Dict, Any, Optional +from io import StringIO + +from app.models.resource_models import ( + ClusterReport, NamespaceReport, ResourceValidation, + VPARecommendation, ExportRequest +) +from app.core.config import settings + +logger = logging.getLogger(__name__) + +class ReportService: + """Serviço para geração de relatórios""" + + def __init__(self): + self.export_path = settings.report_export_path + os.makedirs(self.export_path, exist_ok=True) + + def generate_cluster_report( + self, + pods: List[Any], + validations: List[ResourceValidation], + vpa_recommendations: List[VPARecommendation], + overcommit_info: Dict[str, Any], + nodes_info: List[Dict[str, Any]] + ) -> ClusterReport: + """Gerar relatório do cluster""" + + # Contar namespaces únicos + namespaces = set(pod.namespace for pod in pods) + + # Gerar resumo + summary = self._generate_summary(validations, vpa_recommendations, overcommit_info) + + report = ClusterReport( + timestamp=datetime.now().isoformat(), + total_pods=len(pods), + total_namespaces=len(namespaces), + total_nodes=len(nodes_info), + validations=validations, + vpa_recommendations=vpa_recommendations, + overcommit_info=overcommit_info, + summary=summary + ) + + return report + + def generate_namespace_report( + self, + namespace: str, + pods: List[Any], + validations: List[ResourceValidation], + resource_usage: Dict[str, 
Any] + ) -> NamespaceReport: + """Gerar relatório de um namespace""" + + # Filtrar validações do namespace + namespace_validations = [ + v for v in validations if v.namespace == namespace + ] + + # Gerar recomendações + recommendations = self._generate_namespace_recommendations(namespace_validations) + + report = NamespaceReport( + namespace=namespace, + timestamp=datetime.now().isoformat(), + total_pods=len(pods), + validations=namespace_validations, + resource_usage=resource_usage, + recommendations=recommendations + ) + + return report + + def _generate_summary( + self, + validations: List[ResourceValidation], + vpa_recommendations: List[VPARecommendation], + overcommit_info: Dict[str, Any] + ) -> Dict[str, Any]: + """Gerar resumo do relatório""" + + # Contar validações por severidade + severity_counts = {} + for validation in validations: + severity = validation.severity + if severity not in severity_counts: + severity_counts[severity] = 0 + severity_counts[severity] += 1 + + # Contar validações por tipo + type_counts = {} + for validation in validations: + validation_type = validation.validation_type + if validation_type not in type_counts: + type_counts[validation_type] = 0 + type_counts[validation_type] += 1 + + return { + "total_validations": len(validations), + "severity_breakdown": severity_counts, + "validation_types": type_counts, + "vpa_recommendations_count": len(vpa_recommendations), + "overcommit_detected": overcommit_info.get("overcommit_detected", False), + "critical_issues": severity_counts.get("critical", 0), + "warnings": severity_counts.get("warning", 0), + "errors": severity_counts.get("error", 0) + } + + def _generate_namespace_recommendations( + self, + validations: List[ResourceValidation] + ) -> List[str]: + """Gerar recomendações para um namespace""" + recommendations = [] + + # Agrupar por tipo de problema + problems = {} + for validation in validations: + problem_type = validation.validation_type + if problem_type not in problems: + 
problems[problem_type] = [] + problems[problem_type].append(validation) + + # Gerar recomendações específicas + if "missing_requests" in problems: + count = len(problems["missing_requests"]) + recommendations.append( + f"Criar LimitRange para definir requests padrão " + f"({count} containers sem requests)" + ) + + if "missing_limits" in problems: + count = len(problems["missing_limits"]) + recommendations.append( + f"Definir limits para {count} containers para evitar consumo excessivo" + ) + + if "invalid_ratio" in problems: + count = len(problems["invalid_ratio"]) + recommendations.append( + f"Ajustar ratio limit:request para {count} containers" + ) + + if "overcommit" in problems: + recommendations.append( + "Resolver overcommit de recursos no namespace" + ) + + return recommendations + + async def export_report( + self, + report: ClusterReport, + export_request: ExportRequest + ) -> str: + """Exportar relatório em diferentes formatos""" + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + if export_request.format == "json": + return await self._export_json(report, timestamp) + elif export_request.format == "csv": + return await self._export_csv(report, timestamp) + elif export_request.format == "pdf": + return await self._export_pdf(report, timestamp) + else: + raise ValueError(f"Formato não suportado: {export_request.format}") + + async def _export_json(self, report: ClusterReport, timestamp: str) -> str: + """Exportar relatório em JSON""" + filename = f"cluster_report_{timestamp}.json" + filepath = os.path.join(self.export_path, filename) + + # Converter para dict para serialização + report_dict = report.dict() + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(report_dict, f, indent=2, ensure_ascii=False) + + logger.info(f"Relatório JSON exportado: {filepath}") + return filepath + + async def _export_csv(self, report: ClusterReport, timestamp: str) -> str: + """Exportar relatório em CSV""" + filename = f"cluster_report_{timestamp}.csv" + 
filepath = os.path.join(self.export_path, filename) + + with open(filepath, 'w', newline='', encoding='utf-8') as f: + writer = csv.writer(f) + + # Cabeçalho + writer.writerow([ + "Pod Name", "Namespace", "Container Name", + "Validation Type", "Severity", "Message", "Recommendation" + ]) + + # Dados das validações + for validation in report.validations: + writer.writerow([ + validation.pod_name, + validation.namespace, + validation.container_name, + validation.validation_type, + validation.severity, + validation.message, + validation.recommendation or "" + ]) + + logger.info(f"Relatório CSV exportado: {filepath}") + return filepath + + async def _export_pdf(self, report: ClusterReport, timestamp: str) -> str: + """Exportar relatório em PDF""" + try: + from reportlab.lib.pagesizes import letter + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle + from reportlab.lib.styles import getSampleStyleSheet + from reportlab.lib import colors + + filename = f"cluster_report_{timestamp}.pdf" + filepath = os.path.join(self.export_path, filename) + + doc = SimpleDocTemplate(filepath, pagesize=letter) + styles = getSampleStyleSheet() + story = [] + + # Título + title = Paragraph("OpenShift Resource Governance Report", styles['Title']) + story.append(title) + story.append(Spacer(1, 12)) + + # Resumo + summary_text = f""" + Resumo do Cluster:
+ Total de Pods: {report.total_pods}
+ Total de Namespaces: {report.total_namespaces}
+ Total de Nós: {report.total_nodes}
+ Total de Validações: {report.summary['total_validations']}
+ Problemas Críticos: {report.summary['critical_issues']}
+ """ + story.append(Paragraph(summary_text, styles['Normal'])) + story.append(Spacer(1, 12)) + + # Tabela de validações + if report.validations: + data = [["Pod", "Namespace", "Container", "Tipo", "Severidade", "Mensagem"]] + for validation in report.validations[:50]: # Limitar a 50 para PDF + data.append([ + validation.pod_name, + validation.namespace, + validation.container_name, + validation.validation_type, + validation.severity, + validation.message[:50] + "..." if len(validation.message) > 50 else validation.message + ]) + + table = Table(data) + table.setStyle(TableStyle([ + ('BACKGROUND', (0, 0), (-1, 0), colors.grey), + ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke), + ('ALIGN', (0, 0), (-1, -1), 'CENTER'), + ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'), + ('FONTSIZE', (0, 0), (-1, 0), 14), + ('BOTTOMPADDING', (0, 0), (-1, 0), 12), + ('BACKGROUND', (0, 1), (-1, -1), colors.beige), + ('GRID', (0, 0), (-1, -1), 1, colors.black) + ])) + + story.append(Paragraph("Validações:", styles['Heading2'])) + story.append(table) + + doc.build(story) + logger.info(f"Relatório PDF exportado: {filepath}") + return filepath + + except ImportError: + logger.error("reportlab não instalado. 
Instale com: pip install reportlab") + raise ValueError("PDF export requer reportlab") + + def get_exported_reports(self) -> List[Dict[str, str]]: + """Listar relatórios exportados""" + reports = [] + + for filename in os.listdir(self.export_path): + if filename.endswith(('.json', '.csv', '.pdf')): + filepath = os.path.join(self.export_path, filename) + stat = os.stat(filepath) + reports.append({ + "filename": filename, + "filepath": filepath, + "size": stat.st_size, + "created": datetime.fromtimestamp(stat.st_ctime).isoformat(), + "format": filename.split('.')[-1] + }) + + return sorted(reports, key=lambda x: x["created"], reverse=True) diff --git a/app/services/validation_service.py b/app/services/validation_service.py new file mode 100644 index 0000000..606b77b --- /dev/null +++ b/app/services/validation_service.py @@ -0,0 +1,345 @@ +""" +Serviço de validação de recursos seguindo best practices Red Hat +""" +import logging +from typing import List, Dict, Any +from decimal import Decimal, InvalidOperation +import re + +from app.models.resource_models import PodResource, ResourceValidation, NamespaceResources +from app.core.config import settings + +logger = logging.getLogger(__name__) + +class ValidationService: + """Serviço para validação de recursos""" + + def __init__(self): + self.cpu_ratio = settings.cpu_limit_ratio + self.memory_ratio = settings.memory_limit_ratio + self.min_cpu_request = settings.min_cpu_request + self.min_memory_request = settings.min_memory_request + + def validate_pod_resources(self, pod: PodResource) -> List[ResourceValidation]: + """Validar recursos de um pod""" + validations = [] + + for container in pod.containers: + container_validations = self._validate_container_resources( + pod.name, pod.namespace, container + ) + validations.extend(container_validations) + + return validations + + def _validate_container_resources( + self, + pod_name: str, + namespace: str, + container: Dict[str, Any] + ) -> List[ResourceValidation]: + 
"""Validar recursos de um container""" + validations = [] + resources = container.get("resources", {}) + requests = resources.get("requests", {}) + limits = resources.get("limits", {}) + + # 1. Verificar se requests estão definidos + if not requests: + validations.append(ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container["name"], + validation_type="missing_requests", + severity="error", + message="Container sem requests definidos", + recommendation="Definir requests de CPU e memória para garantir QoS" + )) + + # 2. Verificar se limits estão definidos + if not limits: + validations.append(ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container["name"], + validation_type="missing_limits", + severity="warning", + message="Container sem limits definidos", + recommendation="Definir limits para evitar consumo excessivo de recursos" + )) + + # 3. Validar ratio limit:request + if requests and limits: + cpu_validation = self._validate_cpu_ratio( + pod_name, namespace, container["name"], requests, limits + ) + if cpu_validation: + validations.append(cpu_validation) + + memory_validation = self._validate_memory_ratio( + pod_name, namespace, container["name"], requests, limits + ) + if memory_validation: + validations.append(memory_validation) + + # 4. 
Validar valores mínimos + if requests: + min_validation = self._validate_minimum_values( + pod_name, namespace, container["name"], requests + ) + validations.extend(min_validation) + + return validations + + def _validate_cpu_ratio( + self, + pod_name: str, + namespace: str, + container_name: str, + requests: Dict[str, str], + limits: Dict[str, str] + ) -> ResourceValidation: + """Validar ratio CPU limit:request""" + if "cpu" not in requests or "cpu" not in limits: + return None + + try: + request_value = self._parse_cpu_value(requests["cpu"]) + limit_value = self._parse_cpu_value(limits["cpu"]) + + if request_value > 0: + ratio = limit_value / request_value + + if ratio > self.cpu_ratio * 1.5: # 50% de tolerância + return ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="invalid_ratio", + severity="warning", + message=f"Ratio CPU limit:request muito alto ({ratio:.2f}:1)", + recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.cpu_ratio}:1)" + ) + elif ratio < 1.0: + return ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="invalid_ratio", + severity="error", + message=f"CPU limit menor que request ({ratio:.2f}:1)", + recommendation="CPU limit deve ser maior ou igual ao request" + ) + + except (ValueError, InvalidOperation) as e: + logger.warning(f"Erro ao validar ratio CPU: {e}") + + return None + + def _validate_memory_ratio( + self, + pod_name: str, + namespace: str, + container_name: str, + requests: Dict[str, str], + limits: Dict[str, str] + ) -> ResourceValidation: + """Validar ratio memória limit:request""" + if "memory" not in requests or "memory" not in limits: + return None + + try: + request_value = self._parse_memory_value(requests["memory"]) + limit_value = self._parse_memory_value(limits["memory"]) + + if request_value > 0: + ratio = limit_value / request_value + + if ratio > 
self.memory_ratio * 1.5: # 50% de tolerância + return ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="invalid_ratio", + severity="warning", + message=f"Ratio memória limit:request muito alto ({ratio:.2f}:1)", + recommendation=f"Considerar reduzir limits ou aumentar requests (ratio recomendado: {self.memory_ratio}:1)" + ) + elif ratio < 1.0: + return ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="invalid_ratio", + severity="error", + message=f"Memória limit menor que request ({ratio:.2f}:1)", + recommendation="Memória limit deve ser maior ou igual ao request" + ) + + except (ValueError, InvalidOperation) as e: + logger.warning(f"Erro ao validar ratio memória: {e}") + + return None + + def _validate_minimum_values( + self, + pod_name: str, + namespace: str, + container_name: str, + requests: Dict[str, str] + ) -> List[ResourceValidation]: + """Validar valores mínimos de requests""" + validations = [] + + # Validar CPU mínima + if "cpu" in requests: + try: + request_value = self._parse_cpu_value(requests["cpu"]) + min_value = self._parse_cpu_value(self.min_cpu_request) + + if request_value < min_value: + validations.append(ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="minimum_value", + severity="warning", + message=f"CPU request muito baixo ({requests['cpu']})", + recommendation=f"Considerar aumentar para pelo menos {self.min_cpu_request}" + )) + except (ValueError, InvalidOperation): + pass + + # Validar memória mínima + if "memory" in requests: + try: + request_value = self._parse_memory_value(requests["memory"]) + min_value = self._parse_memory_value(self.min_memory_request) + + if request_value < min_value: + validations.append(ResourceValidation( + pod_name=pod_name, + namespace=namespace, + container_name=container_name, + validation_type="minimum_value", + 
severity="warning", + message=f"Memória request muito baixa ({requests['memory']})", + recommendation=f"Considerar aumentar para pelo menos {self.min_memory_request}" + )) + except (ValueError, InvalidOperation): + pass + + return validations + + def _parse_cpu_value(self, value: str) -> float: + """Converter valor de CPU para float (cores)""" + if value.endswith('m'): + return float(value[:-1]) / 1000 + elif value.endswith('n'): + return float(value[:-1]) / 1000000000 + else: + return float(value) + + def _parse_memory_value(self, value: str) -> int: + """Converter valor de memória para bytes""" + value = value.upper() + + if value.endswith('KI'): + return int(float(value[:-2]) * 1024) + elif value.endswith('MI'): + return int(float(value[:-2]) * 1024 * 1024) + elif value.endswith('GI'): + return int(float(value[:-2]) * 1024 * 1024 * 1024) + elif value.endswith('K'): + return int(float(value[:-1]) * 1000) + elif value.endswith('M'): + return int(float(value[:-1]) * 1000 * 1000) + elif value.endswith('G'): + return int(float(value[:-1]) * 1000 * 1000 * 1000) + else: + return int(value) + + def validate_namespace_overcommit( + self, + namespace_resources: NamespaceResources, + node_capacity: Dict[str, str] + ) -> List[ResourceValidation]: + """Validar overcommit em um namespace""" + validations = [] + + # Calcular total de requests do namespace + total_cpu_requests = self._parse_cpu_value(namespace_resources.total_cpu_requests) + total_memory_requests = self._parse_memory_value(namespace_resources.total_memory_requests) + + # Calcular capacidade total dos nós + total_cpu_capacity = self._parse_cpu_value(node_capacity.get("cpu", "0")) + total_memory_capacity = self._parse_memory_value(node_capacity.get("memory", "0")) + + # Verificar overcommit de CPU + if total_cpu_capacity > 0: + cpu_utilization = (total_cpu_requests / total_cpu_capacity) * 100 + if cpu_utilization > 100: + validations.append(ResourceValidation( + pod_name="namespace", + 
namespace=namespace_resources.name, + container_name="all", + validation_type="overcommit", + severity="critical", + message=f"Overcommit de CPU no namespace: {cpu_utilization:.1f}%", + recommendation="Reduzir requests de CPU ou adicionar mais nós ao cluster" + )) + + # Verificar overcommit de memória + if total_memory_capacity > 0: + memory_utilization = (total_memory_requests / total_memory_capacity) * 100 + if memory_utilization > 100: + validations.append(ResourceValidation( + pod_name="namespace", + namespace=namespace_resources.name, + container_name="all", + validation_type="overcommit", + severity="critical", + message=f"Overcommit de memória no namespace: {memory_utilization:.1f}%", + recommendation="Reduzir requests de memória ou adicionar mais nós ao cluster" + )) + + return validations + + def generate_recommendations(self, validations: List[ResourceValidation]) -> List[str]: + """Gerar recomendações baseadas nas validações""" + recommendations = [] + + # Agrupar validações por tipo + validation_counts = {} + for validation in validations: + validation_type = validation.validation_type + if validation_type not in validation_counts: + validation_counts[validation_type] = 0 + validation_counts[validation_type] += 1 + + # Gerar recomendações baseadas nos problemas encontrados + if validation_counts.get("missing_requests", 0) > 0: + recommendations.append( + f"Implementar LimitRange no namespace para definir requests padrão " + f"({validation_counts['missing_requests']} containers sem requests)" + ) + + if validation_counts.get("missing_limits", 0) > 0: + recommendations.append( + f"Definir limits para {validation_counts['missing_limits']} containers " + "para evitar consumo excessivo de recursos" + ) + + if validation_counts.get("invalid_ratio", 0) > 0: + recommendations.append( + f"Ajustar ratio limit:request para {validation_counts['invalid_ratio']} containers " + f"(recomendado: {self.cpu_ratio}:1)" + ) + + if validation_counts.get("overcommit", 0) > 0: 
+ recommendations.append( + f"Resolver overcommit em {validation_counts['overcommit']} namespaces " + "para evitar problemas de performance" + ) + + return recommendations diff --git a/app/static/index.html b/app/static/index.html new file mode 100644 index 0000000..f11e6c1 --- /dev/null +++ b/app/static/index.html @@ -0,0 +1,530 @@ + + + + + + OpenShift Resource Governance Tool + + + +
+

OpenShift Resource Governance Tool

+

Ferramenta de governança de recursos para clusters OpenShift

+
+ +
+ +
+
+
-
+
Total de Pods
+
+
+
-
+
Namespaces
+
+
+
-
+
Nós
+
+
+
-
+
Problemas Críticos
+
+
+ + +
+

Controles

+
+ + + +
+
+ + +
+

Exportar Relatórios

+
+ + + + + +
+
+ + + + + + + + + + + + + + + +
+
+
+
+
diff --git a/k8s/configmap.yaml b/k8s/configmap.yaml
new file mode 100644
index 0000000..0e258c9
--- /dev/null
+++ b/k8s/configmap.yaml
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: resource-governance-config
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+data:
+  # Application settings (request/limit validation thresholds)
+  CPU_LIMIT_RATIO: "3.0"
+  MEMORY_LIMIT_RATIO: "3.0"
+  MIN_CPU_REQUEST: "10m"
+  MIN_MEMORY_REQUEST: "32Mi"
+
+  # Critical namespaces for VPA. NOTE(review): newline-separated here, while .env.example documents a comma-separated list - confirm the parser accepts this form.
+  CRITICAL_NAMESPACES: |
+    openshift-monitoring
+    openshift-ingress
+    openshift-apiserver
+    openshift-controller-manager
+    openshift-sdn
+
+  # Prometheus URL
+  PROMETHEUS_URL: "http://prometheus.openshift-monitoring.svc.cluster.local:9090"
+
+  # Report settings
+  REPORT_EXPORT_PATH: "/tmp/reports"
+
+  # Security settings
+  ENABLE_RBAC: "true"
+  SERVICE_ACCOUNT_NAME: "resource-governance-sa"
diff --git a/k8s/daemonset.yaml b/k8s/daemonset.yaml
new file mode 100644
index 0000000..73c2dab
--- /dev/null
+++ b/k8s/daemonset.yaml
@@ -0,0 +1,122 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: resource-governance
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: resource-governance
+      app.kubernetes.io/component: governance
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: resource-governance
+        app.kubernetes.io/component: governance
+    spec:
+      serviceAccountName: resource-governance-sa
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000  # NOTE(review): a fixed UID may be rejected by OpenShift's default SCC - confirm an SCC grant exists
+        fsGroup: 1000
+      containers:
+      - name: resource-governance
+        image: resource-governance:latest
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 8080
+          name: http
+          protocol: TCP
+        env:
+        # KUBECONFIG is intentionally not set: the service-account token file is not
+        # a kubeconfig, and in-cluster clients locate the mounted credentials themselves.
+        - name: CPU_LIMIT_RATIO
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: CPU_LIMIT_RATIO
+        - name: MEMORY_LIMIT_RATIO
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: MEMORY_LIMIT_RATIO
+        - name: MIN_CPU_REQUEST
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: MIN_CPU_REQUEST
+        - name: MIN_MEMORY_REQUEST
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: MIN_MEMORY_REQUEST
+        - name: CRITICAL_NAMESPACES
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: CRITICAL_NAMESPACES
+        - name: PROMETHEUS_URL
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: PROMETHEUS_URL
+        - name: REPORT_EXPORT_PATH
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: REPORT_EXPORT_PATH
+        - name: ENABLE_RBAC
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: ENABLE_RBAC
+        - name: SERVICE_ACCOUNT_NAME
+          valueFrom:
+            configMapKeyRef:
+              name: resource-governance-config
+              key: SERVICE_ACCOUNT_NAME
+        resources:
+          requests:
+            cpu: 100m
+            memory: 128Mi
+          limits:
+            cpu: 500m
+            memory: 512Mi
+        livenessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 30
+          periodSeconds: 10
+          timeoutSeconds: 5
+          failureThreshold: 3
+        readinessProbe:
+          httpGet:
+            path: /health
+            port: 8080
+          initialDelaySeconds: 5
+          periodSeconds: 5
+          timeoutSeconds: 3
+          failureThreshold: 3
+        volumeMounts:
+        - name: reports-volume
+          mountPath: /tmp/reports
+        - name: tmp-volume
+          mountPath: /tmp
+      volumes:
+      - name: reports-volume
+        emptyDir: {}
+      - name: tmp-volume
+        emptyDir: {}
+      nodeSelector:
+        kubernetes.io/os: linux
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
+        effect: NoSchedule
diff --git a/k8s/kustomization.yaml b/k8s/kustomization.yaml
new file mode 100644
index 0000000..0a37f9e
--- /dev/null
+++ b/k8s/kustomization.yaml
@@ -0,0 +1,19 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+- namespace.yaml
+- rbac.yaml
+- configmap.yaml
+- daemonset.yaml
+- service.yaml
+- route.yaml
+
+commonLabels:
+  app.kubernetes.io/name: resource-governance
+  app.kubernetes.io/component: governance
+  app.kubernetes.io/part-of: openshift-governance
+
+images:
+- name: resource-governance
+  newTag: latest
diff --git a/k8s/namespace.yaml b/k8s/namespace.yaml
new file mode 100644
index 0000000..b23acb9
--- /dev/null
+++ b/k8s/namespace.yaml
@@ -0,0 +1,36 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: resource-governance
+  labels:
+    name: resource-governance
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+---
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: resource-governance-quota
+  namespace: resource-governance
+spec:
+  hard:
+    requests.cpu: "2"
+    requests.memory: 4Gi
+    limits.cpu: "4"
+    limits.memory: 8Gi
+    pods: "10"  # NOTE(review): the workload is a DaemonSet (one pod per node) - confirm this quota fits the cluster size
+---
+apiVersion: v1
+kind: LimitRange
+metadata:
+  name: resource-governance-limits
+  namespace: resource-governance
+spec:
+  limits:
+  - default:
+      cpu: "500m"
+      memory: "512Mi"
+    defaultRequest:
+      cpu: "100m"
+      memory: "128Mi"
+    type: Container
diff --git a/k8s/rbac.yaml b/k8s/rbac.yaml
new file mode 100644
index 0000000..c708e88
--- /dev/null
+++ b/k8s/rbac.yaml
@@ -0,0 +1,93 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: resource-governance-sa
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: resource-governance-role
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+rules:
+# Read-only access to pods (NOTE(review): also covered by the broader pods rule further down)
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
+# Read-only access to namespaces
+- apiGroups: [""]
+  resources: ["namespaces"]
+  verbs: ["get", "list", "watch"]
+# Read-only access to nodes
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["get", "list", "watch"]
+# Read-only access to VPA (Vertical Pod Autoscaler) objects
+- apiGroups: ["autoscaling.k8s.io"]
+  resources: ["verticalpodautoscalers"]
+  verbs: ["get", "list", "watch"]
+# Read and modify deployments and replicasets (to apply recommendations)
+- apiGroups: ["apps"]
+  resources: ["deployments", "replicasets"]
+  verbs: ["get", "list", "watch", "patch", "update"]
+# Read and modify pods (to apply recommendations)
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch", "patch", "update"]
+# Read and create events (for logging)
+- apiGroups: [""]
+  resources: ["events"]
+  verbs: ["get", "list", "watch", "create"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: resource-governance-binding
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: resource-governance-role
+subjects:
+- kind: ServiceAccount
+  name: resource-governance-sa
+  namespace: resource-governance
+---
+# Role for accessing Prometheus resources (if needed). NOTE(review): scoped to the resource-governance namespace, while PROMETHEUS_URL targets openshift-monitoring - confirm it is required.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: resource-governance-prometheus-role
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+rules:
+# Read-only access to services and endpoints (intended for reaching Prometheus)
+- apiGroups: [""]
+  resources: ["services", "endpoints"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: resource-governance-prometheus-binding
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: resource-governance-prometheus-role
+subjects:
+- kind: ServiceAccount
+  name: resource-governance-sa
+  namespace: resource-governance
diff --git a/k8s/route.yaml b/k8s/route.yaml
new file mode 100644
index 0000000..7c04f47
--- /dev/null
+++ b/k8s/route.yaml
@@ -0,0 +1,23 @@
+apiVersion: route.openshift.io/v1
+kind: Route
+metadata:
+  name: resource-governance-route
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+  annotations:
+    haproxy.router.openshift.io/timeout: "300s"
+    haproxy.router.openshift.io/rate-limit: "100"
+spec:
+  host: resource-governance.apps.openshift.local
+  to:
+    kind: Service
+    name: resource-governance-service
+    weight: 100
+  port:
+    targetPort: http
+  tls:
+    termination: edge
+    insecureEdgeTerminationPolicy: Redirect
+  wildcardPolicy: None
diff --git a/k8s/service.yaml b/k8s/service.yaml
new file mode 100644
index 0000000..0b5fa42
--- /dev/null
+++ b/k8s/service.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: resource-governance-service
+  namespace: resource-governance
+  labels:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
+spec:
+  type: ClusterIP
+  ports:
+  - port: 8080
+    targetPort: 8080
+    protocol: TCP
+    name: http
+  selector:
+    app.kubernetes.io/name: resource-governance
+    app.kubernetes.io/component: governance
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..bd7c9cc
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,14 @@
+fastapi==0.104.1
+uvicorn==0.24.0
+kubernetes==28.1.0
+prometheus-client==0.19.0
+requests==2.31.0
+pydantic==2.5.0
+python-multipart==0.0.6
+jinja2==3.1.2
+aiofiles==23.2.1
+pandas==2.1.4
+reportlab==4.0.7
+python-jose[cryptography]==3.3.0
+passlib[bcrypt]==1.7.4
+python-dotenv==1.0.0
diff --git a/scripts/build.sh b/scripts/build.sh
new file mode 100755
index 0000000..bb583f2
--- /dev/null
+++ b/scripts/build.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# Build script for the OpenShift Resource Governance Tool
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Settings: $1 = image tag (default "latest"), $2 = registry (default "quay.io/openshift")
+IMAGE_NAME="resource-governance"
+TAG="${1:-latest}"
+REGISTRY="${2:-quay.io/openshift}"
+FULL_IMAGE_NAME="${REGISTRY}/${IMAGE_NAME}:${TAG}"
+
+echo -e "${BLUE}🚀 Building OpenShift Resource Governance Tool${NC}"
+echo -e "${BLUE}Image: ${FULL_IMAGE_NAME}${NC}"
+
+# Check that the Docker daemon is running
+if ! docker info > /dev/null 2>&1; then
+    echo -e "${RED}❌ Docker não está rodando. Inicie o Docker e tente novamente.${NC}"
+    exit 1
+fi
+
+# Build the image
+echo -e "${YELLOW}📦 Building Docker image...${NC}"
+# The command runs inside `if`: under `set -e` a separate `$?` check is never
+# reached on failure, so the failure message below would not be printed.
+if docker build -t "${FULL_IMAGE_NAME}" .; then
+    echo -e "${GREEN}✅ Image built successfully!${NC}"
+else
+    echo -e "${RED}❌ Build failed!${NC}"
+    exit 1
+fi
+
+# Smoke-test the image by importing the application module
+echo -e "${YELLOW}🧪 Testing image...${NC}"
+# Same pattern as the build step: test the command directly so the else
+# branch is reachable even though `set -e` is active.
+if docker run --rm "${FULL_IMAGE_NAME}" python -c "import app.main; print('✅ App imports successfully')"; then
+    echo -e "${GREEN}✅ Image test passed!${NC}"
+else
+    echo -e "${RED}❌ Image test failed!${NC}"
+    exit 1
+fi
+
+# Show image information
+echo -e "${BLUE}📊 Image information:${NC}"
+docker images "${FULL_IMAGE_NAME}"
+
+echo -e "${GREEN}🎉 Build completed successfully!${NC}"
+echo -e "${BLUE}To push to registry:${NC}"
+echo -e " docker push ${FULL_IMAGE_NAME}"
+echo -e "${BLUE}To run locally:${NC}"
+echo -e " docker run -p 8080:8080 ${FULL_IMAGE_NAME}"
diff --git a/scripts/deploy.sh b/scripts/deploy.sh
new file mode 100755
index 0000000..64ac405
--- /dev/null
+++ b/scripts/deploy.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+# Deploy script for the OpenShift Resource Governance Tool
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Settings: $1 = image tag (default "latest"), $2 = registry (default "quay.io/openshift")
+NAMESPACE="resource-governance"
+IMAGE_NAME="resource-governance"
+TAG="${1:-latest}"
+REGISTRY="${2:-quay.io/openshift}"
+FULL_IMAGE_NAME="${REGISTRY}/${IMAGE_NAME}:${TAG}"
+
+echo -e "${BLUE}🚀 Deploying OpenShift Resource Governance Tool${NC}"
+echo -e "${BLUE}Namespace: ${NAMESPACE}${NC}"
+echo -e "${BLUE}Image: ${FULL_IMAGE_NAME}${NC}"
+
+# Check that the oc CLI is installed
+if ! command -v oc &> /dev/null; then
+    echo -e "${RED}❌ OpenShift CLI (oc) não está instalado.${NC}"
+    echo -e "${YELLOW}Instale o oc CLI: https://docs.openshift.com/container-platform/latest/cli_reference/openshift_cli/getting-started-cli.html${NC}"
+    exit 1
+fi
+
+# Check that there is an active OpenShift login
+if ! oc whoami &> /dev/null; then
+    echo -e "${RED}❌ Não está logado no OpenShift.${NC}"
+    echo -e "${YELLOW}Faça login com: oc login ${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✅ Logado como: $(oc whoami)${NC}"
+
+# Create the namespace if it does not exist
+echo -e "${YELLOW}📁 Creating namespace...${NC}"
+oc apply -f k8s/namespace.yaml
+
+# Apply RBAC
+echo -e "${YELLOW}🔐 Applying RBAC...${NC}"
+oc apply -f k8s/rbac.yaml
+
+# Apply the ConfigMap
+echo -e "${YELLOW}⚙️ Applying ConfigMap...${NC}"
+oc apply -f k8s/configmap.yaml
+
+# Apply the DaemonSet first so the object exists before its image is overridden
+echo -e "${YELLOW}📦 Applying DaemonSet...${NC}"
+oc apply -f k8s/daemonset.yaml
+
+# Point the DaemonSet at the requested image (after apply, so apply cannot reset it)
+echo -e "${YELLOW}🔄 Updating image in DaemonSet...${NC}"
+oc set image daemonset/resource-governance resource-governance="${FULL_IMAGE_NAME}" -n "${NAMESPACE}"
+
+# Apply the Service
+echo -e "${YELLOW}🌐 Applying Service...${NC}"
+oc apply -f k8s/service.yaml
+
+# Apply the Route
+echo -e "${YELLOW}🛣️ Applying Route...${NC}"
+oc apply -f k8s/route.yaml
+
+# Wait for the DaemonSet rollout (avoids racing pod creation with a label-based wait)
+echo -e "${YELLOW}⏳ Waiting for pods to be ready...${NC}"
+oc rollout status daemonset/resource-governance -n "${NAMESPACE}" --timeout=300s
+
+# Look up the route host
+ROUTE_URL=$(oc get route resource-governance-route -n "${NAMESPACE}" -o jsonpath='{.spec.host}')
+if [ -n "${ROUTE_URL}" ]; then
+    echo -e "${GREEN}🎉 Deploy completed successfully!${NC}"
+    echo -e "${BLUE}🌐 Application URL: https://${ROUTE_URL}${NC}"
+else
+    echo -e "${YELLOW}⚠️ Deploy completed, but route URL not found.${NC}"
+    echo -e "${BLUE}Check with: oc get routes -n ${NAMESPACE}${NC}"
+fi
+
+# Show status
+echo -e "${BLUE}📊 Deployment status:${NC}"
+oc get all -n "${NAMESPACE}"
+
+echo -e "${BLUE}🔍 To check logs:${NC}"
+echo -e " oc logs -f daemonset/resource-governance -n ${NAMESPACE}"
+
+echo -e "${BLUE}🧪 To test health:${NC}"
+echo -e " curl https://${ROUTE_URL}/health"
diff --git a/scripts/undeploy.sh b/scripts/undeploy.sh
new file mode 100755
index 0000000..72386cb
--- /dev/null
+++ b/scripts/undeploy.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Undeploy script for the OpenShift Resource Governance Tool
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Settings
+NAMESPACE="resource-governance"
+
+echo -e "${BLUE}🗑️ Undeploying OpenShift Resource Governance Tool${NC}"
+echo -e "${BLUE}Namespace: ${NAMESPACE}${NC}"
+
+# Check that the oc CLI is installed
+if ! command -v oc &> /dev/null; then
+    echo -e "${RED}❌ OpenShift CLI (oc) não está instalado.${NC}"
+    exit 1
+fi
+
+# Check that there is an active OpenShift login
+if ! oc whoami &> /dev/null; then
+    echo -e "${RED}❌ Não está logado no OpenShift.${NC}"
+    echo -e "${YELLOW}Faça login com: oc login ${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}✅ Logado como: $(oc whoami)${NC}"
+
+# Ask for confirmation before removing anything
+read -p "Tem certeza que deseja remover a aplicação? (y/N): " -n 1 -r
+echo
+if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+    echo -e "${YELLOW}❌ Operação cancelada.${NC}"
+    exit 0
+fi
+
+# Remove the Route
+echo -e "${YELLOW}🛣️ Removing Route...${NC}"
+oc delete -f k8s/route.yaml --ignore-not-found=true
+
+# Remove the Service
+echo -e "${YELLOW}🌐 Removing Service...${NC}"
+oc delete -f k8s/service.yaml --ignore-not-found=true
+
+# Remove the DaemonSet
+echo -e "${YELLOW}📦 Removing DaemonSet...${NC}"
+oc delete -f k8s/daemonset.yaml --ignore-not-found=true
+
+# Wait for the pods to be removed (best effort: a timeout does not abort the script)
+echo -e "${YELLOW}⏳ Waiting for pods to be terminated...${NC}"
+oc wait --for=delete pod -l app.kubernetes.io/name=resource-governance -n "${NAMESPACE}" --timeout=60s || true
+
+# Remove the ConfigMap
+echo -e "${YELLOW}⚙️ Removing ConfigMap...${NC}"
+oc delete -f k8s/configmap.yaml --ignore-not-found=true
+
+# Remove RBAC
+echo -e "${YELLOW}🔐 Removing RBAC...${NC}"
+oc delete -f k8s/rbac.yaml --ignore-not-found=true
+
+# Remove the namespace (optional, asks again)
+read -p "Deseja remover o namespace também? (y/N): " -n 1 -r
+echo
+if [[ $REPLY =~ ^[Yy]$ ]]; then
+    echo -e "${YELLOW}📁 Removing namespace...${NC}"
+    oc delete -f k8s/namespace.yaml --ignore-not-found=true
+    echo -e "${GREEN}✅ Namespace removed.${NC}"
+else
+    echo -e "${YELLOW}⚠️ Namespace mantido.${NC}"
+fi
+
+echo -e "${GREEN}🎉 Undeploy completed successfully!${NC}"
+
+# Check whether any resources remain
+echo -e "${BLUE}🔍 Checking remaining resources:${NC}"
+oc get all -n "${NAMESPACE}" 2>/dev/null || echo -e "${GREEN}✅ No resources found in namespace.${NC}"
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..4195534
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# Setup script for the OpenShift Resource Governance Tool
+set -e
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+echo -e "${BLUE}🚀 Setting up OpenShift Resource Governance Tool${NC}"
+
+# Check that Python 3 is installed
+if ! command -v python3 &> /dev/null; then
+    echo -e "${RED}❌ Python 3 não está instalado.${NC}"
+    echo -e "${YELLOW}Instale Python 3.11+ e tente novamente.${NC}"
+    exit 1
+fi
+
+# Check that pip3 is installed
+if ! command -v pip3 &> /dev/null; then
+    echo -e "${RED}❌ pip3 não está instalado.${NC}"
+    echo -e "${YELLOW}Instale pip3 e tente novamente.${NC}"
+    exit 1
+fi
+
+# Install Python dependencies
+echo -e "${YELLOW}📦 Installing Python dependencies...${NC}"
+pip3 install -r requirements.txt
+
+# Make the helper scripts executable
+echo -e "${YELLOW}🔧 Making scripts executable...${NC}"
+chmod +x scripts/*.sh
+
+# Create the reports directory
+echo -e "${YELLOW}📁 Creating reports directory...${NC}"
+mkdir -p reports
+
+# Check whether Docker is installed (optional: only a warning when missing)
+if command -v docker &> /dev/null; then
+    echo -e "${GREEN}✅ Docker encontrado${NC}"
+else
+    echo -e "${YELLOW}⚠️ Docker não encontrado. Instale para fazer build da imagem.${NC}"
+fi
+
+# Check whether the oc CLI is installed (optional: only a warning when missing)
+if command -v oc &> /dev/null; then
+    echo -e "${GREEN}✅ OpenShift CLI (oc) encontrado${NC}"
+else
+    echo -e "${YELLOW}⚠️ OpenShift CLI (oc) não encontrado. Instale para fazer deploy.${NC}"
+fi
+
+echo -e "${GREEN}🎉 Setup completed successfully!${NC}"
+echo ""
+echo -e "${BLUE}Próximos passos:${NC}"
+echo -e "1. ${YELLOW}Desenvolvimento local:${NC} make dev"
+echo -e "2. ${YELLOW}Build da imagem:${NC} make build"
+echo -e "3. ${YELLOW}Deploy no OpenShift:${NC} make deploy"
+echo -e "4. ${YELLOW}Ver documentação:${NC} cat README.md"
+echo ""
+echo -e "${BLUE}Comandos úteis:${NC}"
+echo -e " make help - Mostrar todos os comandos"
+echo -e " make test - Executar testes"
+echo -e " make logs - Ver logs da aplicação"
+echo -e " make status - Ver status da aplicação"