当前位置：首页 > news > 正文

【DevOps】CI/CD最佳实践：从自动化构建到持续部署

news 2026/8/3 6:06:43

【DevOps】CI/CD最佳实践：从自动化构建到持续部署

引言

在当今快速迭代的软件开发环境中，DevOps已经成为提升开发效率和质量的关键实践。作为一名有着十余年开发经验的程序员，我亲眼见证了从传统的"开发-测试-部署"手动流程，到如今全自动化的CI/CD流水线的演变。这个转变不仅提升了交付效率，更重要的是，它让软件质量更加稳定，让开发者能够更专注于业务价值的实现。

很多团队在实施CI/CD时会遇到各种挑战：构建时间过长、测试覆盖率不足、部署风险高、环境不一致等等。这些问题我相信大家都曾遇到过。今天，我将结合自己多年的实践经验，系统性地分享CI/CD最佳实践，希望能帮助正在建设或优化CI/CD系统的团队少走弯路。

本文将从多个维度展开讨论，包括构建系统设计、测试策略、部署流水线、监控告警等核心环节。每个环节都会配合实际的代码示例，帮助大家更好地理解和应用。

一、CI/CD核心架构设计

1.1 流水线框架设计

一个设计良好的CI/CD流水线框架是整个自动化体系的基础。我将分享一个生产级别的流水线框架设计。

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional, Callable
from datetime import datetime
from enum import Enum
import json
import hashlib
import logging
import time
import uuid

logger = logging.getLogger(__name__)


class StageStatus(Enum):
    """Lifecycle states used for stages, steps and whole pipeline executions."""

    PENDING = "pending"
    RUNNING = "running"
    SUCCESS = "success"
    FAILED = "failed"
    SKIPPED = "skipped"
    CANCELLED = "cancelled"


class PipelineEvent(Enum):
    """Event types passed to listeners registered on a PipelineRunner."""

    STAGE_STARTED = "stage_started"
    STAGE_COMPLETED = "stage_completed"
    STAGE_FAILED = "stage_failed"
    PIPELINE_STARTED = "pipeline_started"
    PIPELINE_COMPLETED = "pipeline_completed"
    PIPELINE_FAILED = "pipeline_failed"


@dataclass
class Stage:
    """A named, ordered group of steps.

    ``condition`` is an optional zero-argument predicate; when it returns a
    falsy value the stage is recorded as skipped instead of being executed.
    """

    name: str
    steps: List['Step']
    # NOTE(review): `timeout` and `retry_count` are declared here but the
    # runner below does not enforce them; only step-level retries are applied.
    timeout: int = 3600  # seconds, default one hour
    continue_on_failure: bool = False
    retry_count: int = 0
    condition: Optional[Callable[[], bool]] = None


@dataclass
class Step:
    """A single command executed as part of a stage."""

    name: str
    command: str
    working_directory: Optional[str] = None
    env_vars: Dict[str, str] = field(default_factory=dict)
    timeout: int = 1800  # seconds, default 30 minutes (not enforced by the runner)
    retry_count: int = 0
    retry_delay: int = 60  # seconds to wait between retry attempts


@dataclass
class PipelineExecution:
    """Record of one pipeline run: status, timing, per-stage results, metadata."""

    pipeline_id: str
    execution_id: str
    status: StageStatus
    start_time: datetime
    end_time: Optional[datetime] = None
    stages: List[Dict[str, Any]] = field(default_factory=list)
    artifacts: Dict[str, str] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)


class PipelineRunner:
    """Runs the stages of a Pipeline sequentially and reports the outcome.

    The notifier must expose ``notify(execution)``; it is called exactly once
    per execution, when the pipeline completes or fails.
    """

    def __init__(self, pipeline: 'Pipeline', notifier: 'PipelineNotifier'):
        self.pipeline = pipeline
        self.notifier = notifier
        self.current_execution: Optional[PipelineExecution] = None
        self.listeners: List[Callable] = []

    def execute(self, trigger_context: Dict[str, Any]) -> PipelineExecution:
        """Run every stage in order and return the execution record.

        Args:
            trigger_context: Arbitrary metadata describing what triggered the
                run. It is copied, so the caller's dict is never modified.

        Returns:
            The PipelineExecution, with status SUCCESS or FAILED. On failure
            ``metadata['failure_reason']`` holds the first reason detected.
        """
        self.current_execution = PipelineExecution(
            pipeline_id=self.pipeline.id,
            execution_id=self._generate_execution_id(),
            status=StageStatus.RUNNING,
            start_time=datetime.now(),
            # Copy: _fail_pipeline writes into metadata.
            metadata=dict(trigger_context or {}),
        )
        self._emit_event(PipelineEvent.PIPELINE_STARTED, self.current_execution)

        # Decide the outcome first and finalize once afterwards, so the
        # notifier is never called twice and the first reason is preserved.
        failure_reason: Optional[str] = None
        try:
            for stage in self.pipeline.stages:
                if stage.condition and not stage.condition():
                    self._skip_stage(stage)
                    continue
                # _execute_stage records the result and emits stage events.
                if not self._execute_stage(stage) and not stage.continue_on_failure:
                    failure_reason = f"Stage {stage.name} failed"
                    break
            else:
                if not self._all_stages_passed():
                    failure_reason = "Pipeline failed due to stage failures"
        except Exception as e:
            logger.exception("Pipeline %s aborted by an exception", self.pipeline.id)
            failure_reason = f"Pipeline failed with exception: {str(e)}"

        if failure_reason is None:
            self._complete_pipeline()
        else:
            self._fail_pipeline(failure_reason)
        return self.current_execution

    def _execute_stage(self, stage: Stage) -> bool:
        """Run the steps of one stage; return True only if all of them succeed."""
        self._emit_event(PipelineEvent.STAGE_STARTED, stage)
        stage_record = {
            'name': stage.name,
            'status': StageStatus.RUNNING,
            'start_time': datetime.now(),
            'steps': []
        }
        self.current_execution.stages.append(stage_record)

        for step in stage.steps:
            step_result = self._execute_step(step)
            if not step_result:
                stage_record['status'] = StageStatus.FAILED
                stage_record['end_time'] = datetime.now()
                stage_record['failed_step'] = step.name
                self._emit_event(PipelineEvent.STAGE_FAILED, (stage, step))
                return False
            stage_record['steps'].append(step_result)

        stage_record['status'] = StageStatus.SUCCESS
        stage_record['end_time'] = datetime.now()
        self._emit_event(PipelineEvent.STAGE_COMPLETED, stage)
        return True

    def _execute_step(self, step: Step) -> Optional[Dict]:
        """Run one step, retrying up to ``step.retry_count`` extra times.

        Returns:
            A result dict on success, or None once every attempt has raised.
        """
        total_attempts = step.retry_count + 1
        for attempt in range(1, total_attempts + 1):
            started = time.monotonic()
            try:
                output = self._run_command(step)
            except Exception as exc:
                logger.warning(
                    "Step %r failed (attempt %d/%d): %s",
                    step.name, attempt, total_attempts, exc,
                )
                if attempt < total_attempts:
                    time.sleep(step.retry_delay)
            else:
                return {
                    'name': step.name,
                    'status': StageStatus.SUCCESS,
                    'output': output,
                    'duration': time.monotonic() - started
                }
        return None

    def _run_command(self, step: Step) -> str:
        """Execute the step's command.

        Placeholder: this simplified version does nothing and returns None.
        A real implementation would run the command in a shell, container or
        over SSH and raise on failure.
        """
        pass

    def _skip_stage(self, stage: Stage):
        """Record a stage whose condition evaluated to false."""
        stage_record = {
            'name': stage.name,
            'status': StageStatus.SKIPPED,
            'skipped_at': datetime.now()
        }
        self.current_execution.stages.append(stage_record)

    def _complete_pipeline(self):
        """Mark the execution successful, emit the event and notify."""
        self.current_execution.status = StageStatus.SUCCESS
        self.current_execution.end_time = datetime.now()
        self._emit_event(PipelineEvent.PIPELINE_COMPLETED, self.current_execution)
        self.notifier.notify(self.current_execution)

    def _fail_pipeline(self, reason: str):
        """Mark the execution failed with ``reason``, emit the event and notify."""
        self.current_execution.status = StageStatus.FAILED
        self.current_execution.end_time = datetime.now()
        self.current_execution.metadata['failure_reason'] = reason
        self._emit_event(PipelineEvent.PIPELINE_FAILED, self.current_execution)
        self.notifier.notify(self.current_execution)

    def _all_stages_passed(self) -> bool:
        """Return True when every recorded stage succeeded or was skipped."""
        acceptable = (StageStatus.SUCCESS, StageStatus.SKIPPED)
        return all(
            s.get('status') in acceptable
            for s in self.current_execution.stages
        )

    def _generate_execution_id(self) -> str:
        """Return a random 12-character hexadecimal execution id."""
        # A timestamp hash can collide for runs started in the same instant.
        return uuid.uuid4().hex[:12]

    def _emit_event(self, event: PipelineEvent, data: Any):
        """Call every registered listener with ``(event, data)``."""
        for listener in self.listeners:
            listener(event, data)

    def add_listener(self, listener: Callable):
        """Register a callable invoked as ``listener(event, data)``."""
        self.listeners.append(listener)


class Pipeline:
    """Pipeline definition: an id, a name and an ordered list of stages."""

    def __init__(self, pipeline_id: str, name: str):
        self.id = pipeline_id
        self.name = name
        self.stages: List[Stage] = []
        self.environment = 'production'

    def add_stage(self, stage: Stage):
        """Append a stage and return ``self`` so calls can be chained."""
        self.stages.append(stage)
        return self

    def on(self, trigger: str):
        """Configure the trigger for this pipeline (placeholder, no-op)."""
        pass

1.2 构建系统实现

# .gitlab-ci.yml example
# GitLab CI configuration file
stages:
  - build
  - test
  - analyze
  - deploy

variables:
  DOCKER_IMAGE: registry.example.com/app
  DOCKER_TAG: $CI_COMMIT_SHORT_SHA

# Build stage: build the image and push it to the registry
build:
  stage: build
  image: docker:20.10.16
  services:
    - docker:20.10.16-dind
  script:
    # Feed the password via stdin so it does not appear in the process list
    # or in the job log (docker warns that -p on the command line is insecure).
    - echo "$CI_REGISTRY_PASSWORD" | docker login -u "$CI_REGISTRY_USER" --password-stdin "$CI_REGISTRY"
    - docker build -t $DOCKER_IMAGE:$DOCKER_TAG .
    - docker push $DOCKER_IMAGE:$DOCKER_TAG
  artifacts:
    paths:
      - build/
    expire_in: 1 week
  only:
    - main
    - develop

# Unit test stage
test:unit:
  stage: test
  image: node:18-alpine
  script:
    - npm ci
    - npm run test:unit -- --coverage
  coverage: '/Lines\s*:\s*(\d+\.\d+)%/'
  artifacts:
    reports:
      junit: junit.xml
      coverage_report:
        coverage_format: cobertura
        path: coverage/cobertura-coverage.xml
  only:
    - main
    - develop
    - merge_requests

# Integration test stage
test:integration:
  stage: test
  image: node:18-alpine
  services:
    - postgres:14
    - redis:7
  variables:
    POSTGRES_DB: test_db
    POSTGRES_USER: test_user
    POSTGRES_PASSWORD: test_password
    REDIS_URL: redis://redis:6379/0
  script:
    - npm ci
    - npm run test:integration
  dependencies:
    - build

# End-to-end test stage
test:e2e:
  stage: test
  image: cypress/base:16
  services:
    - docker:20.10.16-dind
  script:
    # Each job starts from a clean checkout, so dependencies must be
    # installed here before the build can run.
    - npm ci
    - npm run build
    - npm run start:preview &
    - sleep 10
    - npm run test:e2e
  artifacts:
    when: always
    paths:
      - cypress/videos/
      - cypress/screenshots/
  allow_failure: true  # failures are allowed and do not block deployment

# Image vulnerability scan
analyze:security:
  stage: analyze
  image:
    name: aquasec/trivy:latest
    # The image's entrypoint is the trivy binary; clear it so the runner can
    # start a shell for the script section.
    entrypoint: [""]
  script:
    # --exit-code 0 means findings are reported but never fail the job.
    - trivy image --exit-code 0 --severity HIGH,CRITICAL $DOCKER_IMAGE:$DOCKER_TAG
  allow_failure: true

# Production deployment (manual gate)
deploy:production:
  stage: deploy
  image:
    name: bitnami/kubectl:latest
    # Same reason as above: the image's entrypoint is kubectl itself.
    entrypoint: [""]
  environment:
    name: production
    url: https://app.example.com
  script:
    - kubectl set image deployment/app app=$DOCKER_IMAGE:$DOCKER_TAG
    - kubectl rollout status deployment/app --timeout=300s
    - kubectl rollout history deployment/app
  when: manual
  only:
    - main
  retry:
    max: 2
    when:
      - runner_system_failure
      - stuck_or_timeout_failure

二、测试策略与实践

2.1 测试金字塔

一个健康的测试策略应该遵循测试金字塔原则：底层是大量的单元测试，中间层是集成测试，顶层是少量的端到端测试。

import unittest
from abc import ABC, abstractmethod
from typing import Dict, List, Any, Optional
from dataclasses import dataclass
import time


@dataclass
class TestResult:
    """Outcome of running a single TestCase."""

    __test__ = False  # not a pytest test class, despite the "Test" prefix

    name: str
    passed: bool
    duration: float
    error_message: Optional[str] = None
    retry_count: int = 0  # number of retries performed (0 = first attempt)


class TestSuite:
    """A named collection of TestCase objects that can run in sequence or in parallel."""

    __test__ = False  # not a pytest test class, despite the "Test" prefix

    def __init__(self, name: str):
        self.name = name
        self.tests: List['TestCase'] = []
        self.results: List[TestResult] = []

    def add_test(self, test: 'TestCase'):
        """Register a test case with the suite."""
        self.tests.append(test)

    def run(self, parallel: bool = True, workers: int = 4) -> Dict[str, Any]:
        """Run every registered test and return a summary.

        Args:
            parallel: Run tests on a thread pool when True, one by one otherwise.
            workers: Thread pool size for parallel runs (values below 1 are
                treated as 1).

        Returns:
            A dict with the suite name, totals, wall-clock duration and the
            list of TestResult objects in the order the tests were added.
            The same list is stored on ``self.results``.
        """
        started = time.perf_counter()
        if parallel:
            results = self._run_parallel(workers)
        else:
            results = self._run_sequential()
        duration = time.perf_counter() - started

        self.results = results
        passed = sum(1 for r in results if r.passed)
        return {
            'suite_name': self.name,
            'total_tests': len(self.tests),
            'passed': passed,
            'failed': len(results) - passed,
            'duration': duration,
            'results': results
        }

    def _run_sequential(self) -> List[TestResult]:
        """Run the tests one after another, in registration order."""
        return [test.run() for test in self.tests]

    def _run_parallel(self, workers: int) -> List[TestResult]:
        """Run the tests on a thread pool, returning results in registration order."""
        from concurrent.futures import ThreadPoolExecutor

        if not self.tests:
            return []
        with ThreadPoolExecutor(max_workers=max(1, workers)) as executor:
            # executor.map preserves input order, unlike as_completed, so the
            # report is deterministic regardless of which test finishes first.
            return list(executor.map(lambda test: test.run(), self.tests))


class TestCase(ABC):
    """Base class for test cases with setup/teardown and retry-on-exception."""

    __test__ = False  # not a pytest test class, despite the "Test" prefix

    def __init__(self, name: str):
        self.name = name
        self.retry_count = 0
        self.max_retries = 2
        self.retry_delay = 1.0  # seconds to wait before retrying after an exception

    @abstractmethod
    def setup(self):
        """Prepare whatever the test needs before it executes."""
        pass

    @abstractmethod
    def teardown(self):
        """Release whatever setup() acquired."""
        pass

    @abstractmethod
    def execute(self) -> bool:
        """Run the test logic and return True when it passes."""
        pass

    def run(self) -> TestResult:
        """Run the test, retrying when an attempt raises.

        An attempt that returns False is a normal failure and is not retried;
        only attempts that raise are retried, up to ``max_retries`` times.
        ``teardown`` is called exactly once per attempt.

        Returns:
            A TestResult. ``retry_count`` is the number of retries performed.
        """
        started = time.time()
        last_error: Optional[Exception] = None

        for attempt in range(self.max_retries + 1):
            try:
                try:
                    self.setup()
                    success = self.execute()
                finally:
                    self.teardown()
            except Exception as exc:
                last_error = exc
                if attempt < self.max_retries:
                    time.sleep(self.retry_delay)
            else:
                return TestResult(
                    name=self.name,
                    passed=success,
                    duration=time.time() - started,
                    retry_count=attempt
                )

        return TestResult(
            name=self.name,
            passed=False,
            duration=time.time() - started,
            error_message=str(last_error) if last_error is not None else None,
            retry_count=self.max_retries
        )


class UnitTestSuite(TestSuite):
    """Suite for unit tests, with a minimum coverage requirement."""

    def __init__(self):
        super().__init__("Unit Tests")
        self.coverage_threshold = 80.0  # minimum acceptable coverage, in percent

    def validate_coverage(self, coverage_report: Dict) -> bool:
        """Return True when ``report['total']['percent_covered']`` meets the threshold.

        A report without that key is treated as 0% coverage.
        """
        total_coverage = coverage_report.get('total', {}).get('percent_covered', 0)
        return total_coverage >= self.coverage_threshold


class IntegrationTestSuite(TestSuite):
    """Suite for integration tests, listing the services they depend on."""

    def __init__(self):
        super().__init__("Integration Tests")
        self.required_services = ['postgres', 'redis', 'kafka']

2.2 端到端测试框架

import pytest
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from dataclasses import dataclass
from typing import Optional, Dict
import logging
import os


@dataclass
class BrowserConfig:
    """Settings used by E2ETestFramework when it starts the browser."""

    browser: str = 'chrome'
    headless: bool = True
    window_size: str = '1920,1080'
    page_load_timeout: int = 30  # seconds
    implicit_wait: int = 10  # seconds
    screenshots_on_failure: bool = True
    videos_on_failure: bool = True


class E2ETestFramework:
    """Thin helper around a Chrome WebDriver for end-to-end tests."""

    def __init__(self, config: BrowserConfig):
        self.config = config
        self.driver: Optional[webdriver.Remote] = None
        self.logger = logging.getLogger(__name__)
        self.screenshots_dir = 'cypress/screenshots'
        self.videos_dir = 'cypress/videos'

    def setup(self):
        """Start Chrome with the configured options and timeouts."""
        options = Options()
        if self.config.headless:
            options.add_argument('--headless')
        options.add_argument(f'--window-size={self.config.window_size}')
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        self.driver = webdriver.Chrome(options=options)
        self.driver.set_page_load_timeout(self.config.page_load_timeout)
        self.driver.implicitly_wait(self.config.implicit_wait)

    def teardown(self):
        """Quit the browser if one was started."""
        if self.driver:
            self.driver.quit()
            # Drop the reference so a second teardown() is a no-op.
            self.driver = None

    def take_screenshot(self, name: str):
        """Save a PNG named ``name`` under the screenshots directory.

        Does nothing when no browser is running or screenshots are disabled.
        """
        if self.driver and self.config.screenshots_on_failure:
            # save_screenshot does not create missing directories.
            os.makedirs(self.screenshots_dir, exist_ok=True)
            self.driver.save_screenshot(
                f'{self.screenshots_dir}/{name}.png'
            )

    def navigate_to(self, url: str):
        """Open ``url`` and wait until the document reports it is fully loaded."""
        self.driver.get(url)
        # expected_conditions has no page_loaded(); poll document.readyState.
        WebDriverWait(self.driver, self.config.page_load_timeout).until(
            lambda drv: drv.execute_script('return document.readyState') == 'complete'
        )

    def find_element(self, locator: tuple, timeout: int = 10):
        """Wait up to ``timeout`` seconds for the element to be present and return it."""
        return WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located(locator)
        )

    def click(self, locator: tuple):
        """Locate an element and click it."""
        element = self.find_element(locator)
        element.click()

    def input_text(self, locator: tuple, text: str):
        """Locate an input, clear it and type ``text`` into it."""
        element = self.find_element(locator)
        element.clear()
        element.send_keys(text)

    def assert_text(self, locator: tuple, expected_text: str):
        """Assert that the element's text equals ``expected_text`` exactly."""
        element = self.find_element(locator)
        actual_text = element.text
        assert actual_text == expected_text, \
            f"Expected '{expected_text}', but got '{actual_text}'"

    def assert_element_visible(self, locator: tuple):
        """Wait up to 10 seconds for the element to become visible, then assert it."""
        element = WebDriverWait(self.driver, 10).until(
            EC.visibility_of_element_located(locator)
        )
        assert element.is_displayed(), f"Element {locator} is not visible"


# Example test cases
class TestLoginPage:
    """End-to-end tests for the login page."""

    @pytest.fixture(autouse=True)
    def setup_method(self):
        """Start a fresh browser before each test and quit it afterwards."""
        self.framework = E2ETestFramework(BrowserConfig())
        self.framework.setup()
        yield
        self.framework.teardown()

    def test_login_success(self):
        """A valid user is redirected to the dashboard and sees their avatar."""
        # Open the login page
        self.framework.navigate_to('https://app.example.com/login')

        # Enter username and password
        self.framework.input_text(
            (By.ID, 'username'),
            'testuser@example.com'
        )
        self.framework.input_text(
            (By.ID, 'password'),
            'password123'
        )

        # Submit the form
        self.framework.click((By.ID, 'login-button'))

        # Wait for the redirect to the dashboard
        WebDriverWait(self.framework.driver, 10).until(
            EC.url_to_be('https://app.example.com/dashboard')
        )

        # Verify the login succeeded
        self.framework.assert_element_visible(
            (By.XPATH, '//div[@class="user-avatar"]')
        )

    def test_login_invalid_credentials(self):
        """Invalid credentials show the expected error message."""
        self.framework.navigate_to('https://app.example.com/login')

        self.framework.input_text(
            (By.ID, 'username'),
            'invalid@example.com'
        )
        self.framework.input_text(
            (By.ID, 'password'),
            'wrongpassword'
        )

        self.framework.click((By.ID, 'login-button'))

        # Verify the error message
        self.framework.assert_element_visible(
            (By.XPATH, '//div[@class="error-message"]')
        )
        self.framework.assert_text(
            (By.XPATH, '//div[@class="error-message"]'),
            'Invalid username or password'
        )

三、部署策略与实践

3.1 蓝绿部署

蓝绿部署是一种零停机部署策略，通过维护两套相同的环境来实现无缝切换。

import kubectl
from dataclasses import dataclass
from typing import Dict, List, Optional
from datetime import datetime
import time


@dataclass
class DeploymentConfig:
    """Settings for a blue-green deployment."""

    namespace: str
    blue_green: bool = True
    health_check_path: str = '/health'
    health_check_timeout: int = 60
    rollout_timeout: int = 300  # seconds to wait for the new environment to become ready


class BlueGreenDeployer:
    """Deploys to the idle colour, switches the Service over, and rolls back on failure.

    ``current_color`` is the environment serving traffic; ``inactive_color``
    is the one that receives the next deployment.
    """

    def __init__(self, config: DeploymentConfig):
        self.config = config
        self.current_color = 'blue'
        self.inactive_color = 'green'

    def deploy(self, image_tag: str, replicas: int = 3) -> bool:
        """Run a blue-green deployment of ``image_tag``.

        Returns:
            True when the new version is live; False when validation failed
            and traffic was rolled back to the previous environment.

        Raises:
            RuntimeError: The new environment did not become ready in time.
            Exception: Anything raised by the underlying kubectl calls; the
                failed environment is cleaned up before re-raising.
        """
        traffic_switched = False
        try:
            # 1. Deploy the new version to the idle environment
            self._deploy_to_environment(self.inactive_color, image_tag, replicas)

            # 2. Wait for the new version to become ready
            if not self._wait_for_ready(
                self.inactive_color, timeout=self.config.rollout_timeout
            ):
                raise RuntimeError(f"Deployment to {self.inactive_color} failed")

            # 3. Switch traffic
            self._switch_traffic()
            traffic_switched = True

            # 4. Validate the new version
            if not self._validate_deployment():
                self._rollback()
                return False

            # 5. Swap the colour labels for the next deployment
            self._swap_colors()
            return True

        except Exception:
            # Never delete an environment that is still receiving traffic:
            # point the Service back at the previous colour first.
            if traffic_switched:
                self._switch_traffic(self.current_color)
            self._cleanup_failed_deployment()
            raise

    def _deploy_to_environment(self, color: str, image_tag: str, replicas: int):
        """Create or update the ``app-<color>`` Deployment with ``image_tag``."""
        deployment_name = f"app-{color}"

        kubectl.apply({
            'apiVersion': 'apps/v1',
            'kind': 'Deployment',
            'metadata': {
                'name': deployment_name,
                'namespace': self.config.namespace
            },
            'spec': {
                'replicas': replicas,
                'selector': {
                    'matchLabels': {
                        'app': 'myapp',
                        'color': color
                    }
                },
                'template': {
                    'metadata': {
                        'labels': {
                            'app': 'myapp',
                            'color': color,
                            'version': image_tag
                        }
                    },
                    'spec': {
                        'containers': [{
                            'name': 'app',
                            'image': f'registry.example.com/app:{image_tag}',
                            'ports': [{'containerPort': 8080}],
                            'livenessProbe': {
                                'httpGet': {
                                    'path': self.config.health_check_path,
                                    'port': 8080
                                },
                                'initialDelaySeconds': 10,
                                'periodSeconds': 5
                            },
                            'readinessProbe': {
                                'httpGet': {
                                    'path': '/ready',
                                    'port': 8080
                                },
                                'initialDelaySeconds': 5,
                                'periodSeconds': 3
                            }
                        }]
                    }
                }
            }
        })

        print(f"Deployed {image_tag} to {color} environment")

    def _wait_for_ready(self, color: str, timeout: int = 300) -> bool:
        """Poll every 5 seconds until all replicas are available or ``timeout`` elapses."""
        deployment_name = f"app-{color}"
        start_time = time.time()

        while time.time() - start_time < timeout:
            status = kubectl.get_deployment_status(
                deployment_name,
                self.config.namespace
            )

            if (status.available_replicas == status.replicas and
                    status.available_replicas > 0):
                return True

            time.sleep(5)

        return False

    def _switch_traffic(self, target_color: Optional[str] = None):
        """Point the Service selector at ``target_color``.

        Args:
            target_color: Colour that should receive traffic. Defaults to the
                inactive colour, i.e. the freshly deployed environment.
        """
        color = self.inactive_color if target_color is None else target_color
        service_name = 'app-service'

        kubectl.patch(
            'service',
            service_name,
            self.config.namespace,
            {
                'spec': {
                    'selector': {
                        'app': 'myapp',
                        'color': color
                    }
                }
            }
        )

        print(f"Traffic switched to {color} environment")

    def _validate_deployment(self) -> bool:
        """Return False when the error rate after the switch exceeds 1%."""
        # Give traffic a moment to settle before reading metrics
        time.sleep(10)

        metrics = self._get_deployment_metrics()
        # A missing error rate is treated as 100, i.e. as a failed validation.
        error_rate = metrics.get('error_rate', 100)

        if error_rate > 0.01:  # 1% error-rate threshold
            print(f"Error rate too high: {error_rate}")
            return False

        return True

    def _rollback(self):
        """Send traffic back to the previous environment and delete the failed one."""
        # The colours have not been swapped yet, so current_color is still
        # the environment that served traffic before this deployment.
        self._switch_traffic(self.current_color)
        self._cleanup_environment(self.inactive_color)

    def _swap_colors(self):
        """Exchange the active and inactive colour labels."""
        self.current_color, self.inactive_color = \
            self.inactive_color, self.current_color

    def _get_deployment_metrics(self) -> Dict:
        """Return deployment metrics (placeholder with fixed values).

        A real implementation would query the monitoring system.
        """
        return {'error_rate': 0.0, 'p99_latency': 100}

    def _cleanup_environment(self, color: str):
        """Delete the ``app-<color>`` Deployment."""
        deployment_name = f"app-{color}"
        kubectl.delete_deployment(deployment_name, self.config.namespace)

    def _cleanup_failed_deployment(self):
        """Delete the inactive environment after a failed deployment."""
        self._cleanup_environment(self.inactive_color)

3.2 金丝雀发布

金丝雀发布是一种渐进式发布策略，先将小部分流量切换到新版本，验证稳定后再逐步扩大比例。

from dataclasses import dataclass
from typing import Dict, List, Callable
import random
import time


@dataclass
class CanaryConfig:
    """Settings for a canary release. Weights are percentages of traffic."""

    initial_weight: int = 5  # initial traffic share (5%)
    increment: int = 20  # percentage points added per step
    increment_interval: int = 300  # seconds between steps
    max_weight: int = 100  # highest weight reached before full rollout
    analysis_window: int = 600  # seconds of metrics examined at each step
    error_threshold: float = 0.01  # maximum acceptable error rate
    latency_threshold_ms: int = 500  # maximum acceptable P99 latency
    stabilization_wait: int = 60  # seconds to wait after each weight change

    def __post_init__(self):
        # A non-positive increment would keep the rollout at the same weight forever.
        if self.increment <= 0:
            raise ValueError("increment must be a positive number")


class CanaryRelease:
    """Shifts traffic to a new version step by step, rolling back on bad metrics."""

    def __init__(self, config: CanaryConfig):
        self.config = config
        self.current_weight = 0
        self.metrics_collector = MetricsCollector()

    def deploy(self, new_version: str) -> bool:
        """Run the canary release for ``new_version``.

        Returns:
            True after a full rollout; False when metrics degraded and the
            release was rolled back.
        """
        print(f"Starting canary release for version {new_version}")

        # 1. Deploy the new version without traffic
        self._deploy_new_version(new_version, weight=0)

        # 2. Start at the initial weight
        self.current_weight = self.config.initial_weight

        # 3. Increase traffic step by step
        while self.current_weight <= self.config.max_weight:
            print(f"\nIncreasing weight to {self.current_weight}%")

            self._update_weight(self.current_weight)

            # Let traffic settle before looking at metrics
            time.sleep(self.config.stabilization_wait)

            if self._analyze_metrics():
                print("Metrics look good, proceeding...")
            else:
                print("Metrics degraded, rolling back!")
                self._rollback()
                return False

            # The maximum weight has been validated: stop stepping. Without
            # this exit the loop condition stays true forever.
            if self.current_weight >= self.config.max_weight:
                break

            self.current_weight = min(
                self.current_weight + self.config.increment,
                self.config.max_weight
            )
            time.sleep(self.config.increment_interval)

        # 4. Full switch
        print("All traffic to new version")
        self._full_rollout(new_version)

        return True

    def _deploy_new_version(self, version: str, weight: int):
        """Create the canary deployment (placeholder, no-op)."""
        pass

    def _update_weight(self, weight: int):
        """Apply the traffic weight, e.g. in a service-mesh route (placeholder, no-op)."""
        pass

    def _analyze_metrics(self) -> bool:
        """Return True when error rate, P99 latency and conversion are within limits."""
        metrics = self.metrics_collector.get_metrics(
            window=self.config.analysis_window
        )

        # Error rate
        if metrics.error_rate > self.config.error_threshold:
            print(f"Error rate {metrics.error_rate} exceeds threshold")
            return False

        # Latency
        if metrics.p99_latency > self.config.latency_threshold_ms:
            print(f"P99 latency {metrics.p99_latency}ms exceeds threshold")
            return False

        # Business metric
        if metrics.conversion_rate_degradation > 0.05:
            print("Conversion rate significantly degraded")
            return False

        return True

    def _rollback(self):
        """Remove all traffic from the new version and clean it up."""
        self._update_weight(0)
        self.current_weight = 0
        self._cleanup_new_version()

    def _full_rollout(self, version: str):
        """Send 100% of traffic to the new version."""
        self._update_weight(100)
        self.current_weight = 100

    def _cleanup_new_version(self):
        """Delete the canary deployment (placeholder, no-op)."""
        pass


class MetricsCollector:
    """Source of release metrics. Subclass or replace to query a real backend."""

    def get_metrics(self, window: int) -> 'Metrics':
        """Return metrics aggregated over the last ``window`` seconds.

        Raises:
            NotImplementedError: Always; this base class has no backend.
        """
        raise NotImplementedError(
            "MetricsCollector.get_metrics must be implemented for a monitoring backend"
        )


@dataclass
class Metrics:
    """Metrics snapshot consumed by CanaryRelease._analyze_metrics."""

    error_rate: float
    p50_latency: float
    p99_latency: float
    request_count: int
    conversion_rate: float
    conversion_rate_degradation: float

四、监控与反馈

4.1 部署监控仪表板

import prometheus_client as prom
from prometheus_client import Counter, Histogram, Gauge, Summary

# Metric definitions
DEPLOYMENT_COUNT = Counter(
    'deployments_total',
    'Total number of deployments',
    ['app', 'environment', 'status']
)

DEPLOYMENT_DURATION = Histogram(
    'deployment_duration_seconds',
    'Deployment duration in seconds',
    ['app', 'stage']
)

DEPLOYMENT_WEIGHT = Gauge(
    'deployment_canary_weight',
    'Current canary deployment weight',
    ['app']
)

HEALTH_CHECK_STATUS = Gauge(
    'health_check_status',
    'Health check status (1=healthy, 0=unhealthy)',
    ['app', 'instance']
)

ROLLBACK_COUNT = Counter(
    'rollbacks_total',
    'Total number of rollbacks',
    ['app', 'reason']
)


class DeploymentMonitor:
    """Records deployment metrics for one application and exposes them over HTTP."""

    # Ports that already have a metrics server in this process, so creating a
    # second monitor does not try to bind the same port again.
    _served_ports = set()

    def __init__(self, app_name: str, metrics_port: int = 9090):
        """Create the monitor and start the metrics endpoint if needed.

        Args:
            app_name: Value used for the ``app`` label on every metric.
            metrics_port: Port for the Prometheus scrape endpoint.
        """
        self.app_name = app_name
        if metrics_port not in DeploymentMonitor._served_ports:
            # start_http_server is a prometheus_client function, not a method
            # of this class.
            prom.start_http_server(metrics_port)
            DeploymentMonitor._served_ports.add(metrics_port)

    def record_deployment(self, environment: str, status: str):
        """Count one deployment for the given environment and status."""
        DEPLOYMENT_COUNT.labels(
            app=self.app_name,
            environment=environment,
            status=status
        ).inc()

    def record_rollback(self, reason: str):
        """Count one rollback with the given reason."""
        ROLLBACK_COUNT.labels(
            app=self.app_name,
            reason=reason
        ).inc()

    def record_canary_weight(self, weight: int):
        """Set the current canary traffic weight."""
        DEPLOYMENT_WEIGHT.labels(
            app=self.app_name
        ).set(weight)

    def observe_duration(self, stage: str, duration: float):
        """Record how long a deployment stage took, in seconds."""
        DEPLOYMENT_DURATION.labels(
            app=self.app_name,
            stage=stage
        ).observe(duration)

    def update_health_status(self, instance: str, healthy: bool):
        """Set the health gauge for an instance to 1 (healthy) or 0 (unhealthy)."""
        HEALTH_CHECK_STATUS.labels(
            app=self.app_name,
            instance=instance
        ).set(1 if healthy else 0)

4.2 自动化告警系统

# alertmanager.yml configuration
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alerts@example.com'
  smtp_auth_username: 'alerts@example.com'
  # NOTE(review): placeholder value. Do not commit a real password here;
  # load it from a secret at deploy time instead.
  smtp_auth_password: 'password'

route:
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: 'team-notifications'
  routes:
    - match:
        severity: critical
      receiver: 'pagerduty'
      # Keep evaluating the following routes after this one matches, so a
      # critical deployment alert also reaches 'deployment-alerts'.
      continue: true
    - match:
        component: deployment
      receiver: 'deployment-alerts'

receivers:
  - name: 'team-notifications'
    email_configs:
      - to: 'team@example.com'
        headers:
          # NOTE(review): the "email.subject" template is not defined in this
          # file; it has to come from a template file loaded via `templates:`.
          subject: '{{ template "email.subject" . }}'

  - name: 'pagerduty'
    pagerduty_configs:
      - service_key: 'YOUR_PAGERDUTY_KEY'
        severity: critical

  - name: 'deployment-alerts'
    webhook_configs:
      - url: 'http://alert-service:8080/webhook'

五、环境管理与配置

5.1 多环境配置

from typing import Dict, Any
from collections.abc import Mapping
from dataclasses import dataclass
import os


@dataclass
class Environment:
    """Configuration for one deployment environment."""

    name: str
    base_url: str
    database_url: str
    redis_url: str
    smtp_config: Dict[str, Any]  # host/from are strings, port is an int
    feature_flags: Dict[str, bool]
    resource_limits: Dict[str, str]  # e.g. {'cpu': '500m', 'memory': '512Mi'}


def _require_env(var_name: str) -> str:
    """Return the value of a mandatory environment variable.

    Raises:
        RuntimeError: The variable is not set.
    """
    try:
        return os.environ[var_name]
    except KeyError:
        raise RuntimeError(
            f"Required environment variable {var_name} is not set"
        ) from None


def _development_environment() -> Environment:
    """Build the local development configuration (no secrets required)."""
    return Environment(
        name='development',
        base_url='http://localhost:3000',
        database_url='postgresql://localhost:5432/dev_db',
        redis_url='redis://localhost:6379/0',
        smtp_config={
            'host': 'localhost',
            'port': 1025,
            'from': 'dev@example.com'
        },
        feature_flags={
            'enable_cache': False,
            'enable_analytics': False,
            'debug_mode': True
        },
        resource_limits={
            'cpu': '500m',
            'memory': '512Mi'
        }
    )


def _staging_environment() -> Environment:
    """Build the staging configuration from STAGING_* environment variables."""
    return Environment(
        name='staging',
        base_url='https://staging.example.com',
        database_url=_require_env('STAGING_DB_URL'),
        redis_url=_require_env('STAGING_REDIS_URL'),
        smtp_config={
            'host': 'smtp.example.com',
            'port': 587,
            'from': 'staging@example.com'
        },
        feature_flags={
            'enable_cache': True,
            'enable_analytics': True,
            'debug_mode': False
        },
        resource_limits={
            'cpu': '1000m',
            'memory': '1Gi'
        }
    )


def _production_environment() -> Environment:
    """Build the production configuration from PROD_* environment variables."""
    return Environment(
        name='production',
        base_url='https://app.example.com',
        database_url=_require_env('PROD_DB_URL'),
        redis_url=_require_env('PROD_REDIS_URL'),
        smtp_config={
            'host': 'smtp.example.com',
            'port': 587,
            'from': 'noreply@example.com'
        },
        feature_flags={
            'enable_cache': True,
            'enable_analytics': True,
            'debug_mode': False
        },
        resource_limits={
            'cpu': '2000m',
            'memory': '2Gi'
        }
    )


class _LazyEnvironments(Mapping):
    """Read-only ``name -> Environment`` mapping that builds entries on first access.

    Building lazily means importing this module never needs the staging or
    production secrets; they are only read when that environment is requested.
    """

    def __init__(self, factories):
        self._factories = dict(factories)
        self._cache = {}

    def __getitem__(self, name):
        if name not in self._cache:
            # Raises KeyError for unknown names, like a plain dict.
            self._cache[name] = self._factories[name]()
        return self._cache[name]

    def __iter__(self):
        return iter(self._factories)

    def __len__(self):
        return len(self._factories)

    def __contains__(self, name):
        # Membership must not build the entry (and so must not need secrets).
        return name in self._factories


class EnvironmentManager:
    """Lookup of per-environment configuration."""

    ENVIRONMENTS = _LazyEnvironments({
        'development': _development_environment,
        'staging': _staging_environment,
        'production': _production_environment,
    })

    @classmethod
    def get_environment(cls, env_name: str) -> Environment:
        """Return the configuration for ``env_name``.

        Raises:
            ValueError: ``env_name`` is not a known environment.
            RuntimeError: A required environment variable is not set.
        """
        if env_name not in cls.ENVIRONMENTS:
            raise ValueError(f"Unknown environment: {env_name}")
        return cls.ENVIRONMENTS[env_name]

    @classmethod
    def get_config_for_deployment(cls, env_name: str, version: str) -> Dict[str, Any]:
        """Return the deployment settings for ``version`` in ``env_name``.

        The nested dicts are copies, so callers can modify the result without
        affecting the shared environment definition.
        """
        env = cls.get_environment(env_name)

        return {
            'environment': env.name,
            'image_tag': version,
            'database_url': env.database_url,
            'redis_url': env.redis_url,
            'config': {
                'base_url': env.base_url,
                'smtp': dict(env.smtp_config),
                'feature_flags': dict(env.feature_flags)
            },
            'resources': dict(env.resource_limits)
        }