一套可直接落地的 **Hyperf 事故复盘与演练平台(工程版)** 开源方案,覆盖从 0 搭建到持续维护,并给出关键代码骨架(可运行方向)。

---

## 1)平台目标(工程版 MVP)

先做这 8 个核心能力:

1. 事故登记与状态流转(发现 -> 处理中 -> 已恢复 -> 已复盘)
2. 事故时间线(Timeline)自动沉淀
3. 复盘报告模板化生成(5 Whys + CAPA)
4. 演练计划(定时演练、手动演练)
5. 演练执行记录(步骤、结果、评分)
6. 行动项(Owner、截止时间、跟踪状态)
7. 指标看板(MTTA、MTTR、复发率、演练通过率)
8. API + Webhook(对接告警平台/IM)

---

## 2)推荐仓库结构

```text
hyperf-incident-drill/
├─ app/
│  ├─ Controller/
│  │  ├─ IncidentController.php
│  │  ├─ PostmortemController.php
│  │  ├─ DrillPlanController.php
│  │  └─ MetricsController.php
│  ├─ Model/
│  │  ├─ Incident.php
│  │  ├─ IncidentTimeline.php
│  │  ├─ Postmortem.php
│  │  ├─ ActionItem.php
│  │  ├─ DrillPlan.php
│  │  └─ DrillRun.php
│  ├─ Service/
│  │  ├─ IncidentService.php
│  │  ├─ PostmortemService.php
│  │  ├─ DrillService.php
│  │  └─ MetricsService.php
│  ├─ Job/
│  │  ├─ DrillRunJob.php
│  │  └─ ReminderActionItemJob.php
│  ├─ Crontab/
│  │  └─ DrillSchedulerCrontab.php
│  └─ Middleware/
│     └─ AuthzMiddleware.php
├─ config/autoload/
│  ├─ routes.php
│  ├─ async_queue.php
│  ├─ crontab.php
│  └─ databases.php
├─ migrations/
├─ tests/
├─ docker-compose.yml
├─ .github/workflows/ci.yml
├─ README.md
├─ SECURITY.md
└─ LICENSE
```

---

## 3)从 0 初始化

```bash
composer create-project hyperf/hyperf-skeleton hyperf-incident-drill
cd hyperf-incident-drill
composer require hyperf/db-connection hyperf/database
composer require hyperf/async-queue hyperf/redis
composer require hyperf/crontab hyperf/validation
composer require ramsey/uuid nesbot/carbon dragonmantank/cron-expression
composer require --dev phpunit/phpunit phpstan/phpstan friendsofphp/php-cs-fixer
```

> 说明:`dragonmantank/cron-expression` 提供 5.3 节调度器使用的 `Cron\CronExpression`,需要显式安装。

---

## 4)数据库设计(关键表)

### 4.1 incidents(事故主表)
```php
// Incident master table: one row per incident, with lifecycle timestamps.
Schema::create('incidents', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->string('incident_no', 64)->unique();
    $table->string('title', 200);
    $table->tinyInteger('severity')->default(2); // 1~4
    $table->tinyInteger('status')->default(1);   // 1 open, 2 mitigating, 3 recovered, 4 reviewed
    $table->string('service', 100)->nullable();
    $table->string('commander', 64)->nullable();
    $table->timestamp('detected_at')->nullable();
    $table->timestamp('mitigated_at')->nullable();
    $table->timestamp('resolved_at')->nullable();
    $table->timestamp('reviewed_at')->nullable();
    $table->json('tags')->nullable();
    $table->timestamps();

    $table->index(['status', 'severity']);
    $table->index(['detected_at']);
});
```

### 4.2 incident_timelines(时间线)

```php
// Timeline events attached to an incident, indexed for chronological reads.
Schema::create('incident_timelines', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id');
    $table->timestamp('event_time');
    $table->string('event_type', 50); // alert / mitigation / recovery / note
    $table->text('content');
    $table->string('operator', 64)->nullable();
    $table->timestamps();

    $table->index(['incident_id', 'event_time']);
});
```

### 4.3 postmortems(复盘报告)

```php
// Postmortem report; the unique index allows at most one report per incident.
Schema::create('postmortems', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id')->unique();
    $table->text('impact_summary')->nullable();
    $table->json('root_causes')->nullable();
    $table->json('five_whys')->nullable();
    $table->json('lessons')->nullable();
    $table->tinyInteger('status')->default(1); // 1 draft, 2 published
    $table->timestamps();
});
```

### 4.4 action_items(改进行动项)
```php
// Follow-up action items, optionally linked to an incident and/or a postmortem.
Schema::create('action_items', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('incident_id')->nullable();
    $table->unsignedBigInteger('postmortem_id')->nullable();
    $table->string('title', 200);
    $table->string('owner', 64);
    $table->date('due_date')->nullable();
    $table->tinyInteger('priority')->default(2); // 1 high, 2 med, 3 low
    $table->tinyInteger('status')->default(1);   // 1 todo, 2 doing, 3 done, 4 overdue
    $table->timestamps();

    $table->index(['owner', 'status']);
    $table->index(['due_date', 'status']);
});
```

### 4.5 drill_plans / drill_runs(演练计划与执行)

```php
// Drill plan definition: what to exercise, the steps, and an optional cron schedule.
Schema::create('drill_plans', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->string('name', 150);
    $table->string('target_service', 100);
    $table->string('scenario', 200);
    $table->string('cron_expr', 64)->nullable();
    $table->json('steps'); // drill steps
    $table->tinyInteger('enabled')->default(1);
    $table->timestamp('next_run_at')->nullable();
    $table->timestamps();
});

// One row per execution of a drill plan.
Schema::create('drill_runs', function (Blueprint $table) {
    $table->bigIncrements('id');
    $table->unsignedBigInteger('plan_id');
    $table->string('run_no', 64)->unique();
    $table->tinyInteger('status')->default(1); // 1 running, 2 pass, 3 fail
    $table->unsignedInteger('score')->default(0);
    $table->json('result')->nullable();
    $table->timestamp('started_at')->nullable();
    $table->timestamp('finished_at')->nullable();
    $table->timestamps();

    $table->index(['plan_id', 'created_at']);
});
```

---

## 5)核心代码骨架

### 5.1 事故服务(状态流转 + 时间线)

`app/Service/IncidentService.php`

```php
<?php

declare(strict_types=1);

namespace App\Service;

use App\Model\Incident;
use App\Model\IncidentTimeline;
use Hyperf\DbConnection\Db;
use InvalidArgumentException;
use Ramsey\Uuid\Uuid;

/**
 * Incident lifecycle operations: creation, status transitions and timeline entries.
 */
final class IncidentService
{
    /**
     * Target status => timestamp column stamped when an incident enters that status.
     * Status 1 (open) is the initial state and is never a transition target.
     */
    private const STATUS_TIMESTAMP_COLUMNS = [
        2 => 'mitigated_at', // mitigating
        3 => 'resolved_at',  // recovered
        4 => 'reviewed_at',  // reviewed
    ];

    /**
     * Create an incident in status 1 (open) and record an initial "alert" timeline event.
     * Both writes run in one transaction.
     *
     * @param array $data Reads: title (required), severity, service, commander,
     *                    detected_at, tags, operator.
     */
    public function create(array $data): Incident
    {
        return Db::transaction(function () use ($data) {
            $incident = Incident::query()->create([
                'incident_no' => 'INC-' . date('Ymd') . '-' . substr(Uuid::uuid4()->toString(), 0, 8),
                'title' => $data['title'],
                'severity' => (int) ($data['severity'] ?? 2),
                'status' => 1,
                'service' => $data['service'] ?? null,
                'commander' => $data['commander'] ?? null,
                'detected_at' => $data['detected_at'] ?? date('Y-m-d H:i:s'),
                // NOTE(review): assumes the Incident model casts `tags` to JSON — confirm.
                'tags' => $data['tags'] ?? [],
            ]);

            $this->appendTimeline((int) $incident->id, 'alert', 'incident created', $data['operator'] ?? 'system');

            return $incident;
        });
    }

    /**
     * Move an incident forward to $toStatus, stamp the matching lifecycle timestamp
     * and record a "status_change" timeline event, all in one transaction.
     *
     * Only forward moves are accepted (skipping a stage is allowed, e.g. open -> recovered),
     * so an earlier lifecycle timestamp is never overwritten by a repeated or backward call.
     *
     * @throws InvalidArgumentException when $toStatus is not 2, 3 or 4, or is not
     *                                  greater than the incident's current status
     */
    public function transition(int $incidentId, int $toStatus, string $operator, string $note = ''): void
    {
        $timestampColumn = self::STATUS_TIMESTAMP_COLUMNS[$toStatus]
            ?? throw new InvalidArgumentException("Unsupported target status: {$toStatus}");

        Db::transaction(function () use ($incidentId, $toStatus, $operator, $note, $timestampColumn): void {
            // Lock the row so two concurrent transitions cannot both pass the check below.
            $incident = Incident::query()->lockForUpdate()->findOrFail($incidentId);

            $fromStatus = (int) $incident->status;
            if ($toStatus <= $fromStatus) {
                throw new InvalidArgumentException("Illegal status transition: {$fromStatus} -> {$toStatus}");
            }

            $incident->status = $toStatus;
            $incident->{$timestampColumn} = date('Y-m-d H:i:s');
            $incident->save();

            $this->appendTimeline($incidentId, 'status_change', "status->{$toStatus};{$note}", $operator);
        });
    }

    /**
     * Append one timeline event, timestamped with the current time, to an incident.
     */
    public function appendTimeline(int $incidentId, string $type, string $content, string $operator): void
    {
        IncidentTimeline::query()->create([
            'incident_id' => $incidentId,
            'event_time' => date('Y-m-d H:i:s'),
            'event_type' => $type,
            'content' => $content,
            'operator' => $operator,
        ]);
    }
}
```

### 5.2 演练执行 Job(异步)

`app/Job/DrillRunJob.php`

```php
<?php

declare(strict_types=1);

namespace App\Job;

use App\Model\DrillPlan;
use App\Model\DrillRun;
use Hyperf\AsyncQueue\Job;
use Ramsey\Uuid\Uuid;

/**
 * Queue job that executes one drill plan and stores the outcome as a drill_runs row.
 */
final class DrillRunJob extends Job
{
    public function __construct(public int $planId)
    {
    }

    /**
     * Create a run record in status 1 (running), execute every step, then finalize it.
     * The score starts at 100 and loses 20 per failed step, floored at 0.
     */
    public function handle(): void
    {
        $plan = DrillPlan::query()->findOrFail($this->planId);

        $run = DrillRun::query()->create([
            'plan_id' => $plan->id,
            'run_no' => 'DR-' . date('YmdHis') . '-' . substr(Uuid::uuid4()->toString(), 0, 6),
            'status' => 1,
            'started_at' => date('Y-m-d H:i:s'),
        ]);

        // NOTE(review): assumes the DrillPlan model casts `steps` to an array — confirm.
        $steps = $plan->steps ?? [];
        $result = [];
        $score = 100;
        $failed = false;

        foreach ($steps as $idx => $step) {
            try {
                // Replace with the real drill action
                // (call the load-test platform / fault-injection API / script gateway).
                $result[] = ['step' => $idx + 1, 'name' => $step['name'] ?? 'step', 'ok' => true];
            } catch (\Throwable $e) {
                // A failing step is recorded and scored; it does not abort the remaining steps.
                $failed = true;
                $score -= 20;
                $result[] = [
                    'step' => $idx + 1,
                    'name' => $step['name'] ?? 'step',
                    'ok' => false,
                    'error' => $e->getMessage(),
                ];
            }
        }

        $run->status = $failed ? 3 : 2; // 3 fail, 2 pass
        $run->score = max(0, $score);
        $run->result = $result;
        $run->finished_at = date('Y-m-d H:i:s');
        $run->save();
    }
}
```

### 5.3 定时调度演练(每分钟扫描)

`app/Crontab/DrillSchedulerCrontab.php`

```php
<?php

declare(strict_types=1);

namespace App\Crontab;

use App\Job\DrillRunJob;
use App\Model\DrillPlan;
use Cron\CronExpression;
use Hyperf\AsyncQueue\Driver\DriverFactory;
use Hyperf\Crontab\Annotation\Crontab;
use Hyperf\DbConnection\Db;

/**
 * Scans enabled drill plans every minute and queues a DrillRunJob for each plan that is due.
 */
final class DrillSchedulerCrontab
{
    public function __construct(private DriverFactory $driverFactory)
    {
    }

    /**
     * Advance `next_run_at` for every due plan inside one transaction, then queue the jobs.
     *
     * Jobs are pushed only after the transaction has committed: if anything fails while
     * the schedule is being advanced (for example an invalid cron expression), the
     * rollback leaves no already-queued job behind to be queued again on the next tick.
     */
    #[Crontab(rule: '* * * * *', name: 'DrillScheduler', memo: 'schedule drill plans', singleton: true)]
    public function execute(): void
    {
        $duePlanIds = Db::transaction(function (): array {
            $due = [];
            $plans = DrillPlan::query()->where('enabled', 1)->lockForUpdate()->get();

            foreach ($plans as $plan) {
                // Plans without a cron expression are manual-only.
                if (! $plan->cron_expr) {
                    continue;
                }
                // Not due yet. A null next_run_at means "never scheduled": run now.
                if ($plan->next_run_at && strtotime((string) $plan->next_run_at) > time()) {
                    continue;
                }

                $plan->next_run_at = (new CronExpression($plan->cron_expr))
                    ->getNextRunDate()
                    ->format('Y-m-d H:i:s');
                $plan->save();

                $due[] = (int) $plan->id;
            }

            return $due;
        });

        $queue = $this->driverFactory->get('default');
        foreach ($duePlanIds as $planId) {
            $queue->push(new DrillRunJob($planId));
        }
    }
}
```

### 5.4 指标服务(MTTA / MTTR)

`app/Service/MetricsService.php`

```php
<?php

declare(strict_types=1);

namespace App\Service;

use App\Model\Incident;
use Carbon\Carbon;

/**
 * Aggregated incident metrics.
 */
final class MetricsService
{
    /**
     * Summarize all incidents that have a detection time.
     *
     * `mtta_min` averages detected_at -> mitigated_at and `mttr_min` averages
     * detected_at -> resolved_at, both in minutes rounded to 2 decimals; each is
     * null when no incident has the corresponding timestamp.
     *
     * @return array{incident_total: int, mtta_min: float|null, mttr_min: float|null}
     */
    public function summary(): array
    {
        $incidents = Incident::query()->whereNotNull('detected_at')->get();

        $mtta = [];
        $mttr = [];

        foreach ($incidents as $i) {
            $detectedAt = Carbon::parse($i->detected_at);

            if ($i->mitigated_at) {
                $mtta[] = $detectedAt->diffInMinutes(Carbon::parse($i->mitigated_at));
            }
            if ($i->resolved_at) {
                $mttr[] = $detectedAt->diffInMinutes(Carbon::parse($i->resolved_at));
            }
        }

        return [
            'incident_total' => $incidents->count(),
            'mtta_min' => $mtta === [] ? null : round(array_sum($mtta) / count($mtta), 2),
            'mttr_min' => $mttr === [] ? null : round(array_sum($mttr) / count($mttr), 2),
        ];
    }
}
```

---

## 6)API 路由(最小闭环)

`config/autoload/routes.php`

```php
<?php

use App\Controller\DrillPlanController;
use App\Controller\IncidentController;
use App\Controller\MetricsController;
use App\Controller\PostmortemController;
use Hyperf\HttpServer\Router\Router;

Router::addGroup('/api', function () {
    Router::post('/incidents', [IncidentController::class, 'create']);
    Router::post('/incidents/{id:\d+}/transition', [IncidentController::class, 'transition']);
    Router::post('/incidents/{id:\d+}/timeline', [IncidentController::class, 'appendTimeline']);
    Router::post('/incidents/{id:\d+}/postmortem', [PostmortemController::class, 'upsert']);

    Router::post('/drill/plans', [DrillPlanController::class, 'create']);
    Router::post('/drill/plans/{id:\d+}/run', [DrillPlanController::class, 'runNow']);

    Router::get('/metrics/summary', [MetricsController::class, 'summary']);
});
```

---

## 7)本地运行与基础设施

`docker-compose.yml`(最小)

```yaml
version: "3.8"
services:
  mysql:
    image: mysql:8.0
    environment:
      MYSQL_ROOT_PASSWORD: root
      MYSQL_DATABASE: incident
    ports: ["3306:3306"]
  redis:
    image: redis:7
    ports: ["6379:6379"]
```

---

## 8)CI/CD(开源必须)

`.github/workflows/ci.yml` 至少包含:

- composer validate
- php-cs-fixer fix --dry-run
- phpstan analyse
- phpunit
- 集成测试:创建 incident -> 状态流转 -> 生成 postmortem -> 触发演练 run

---

## 9)开源发布完整流程

1. LICENSE:MIT / Apache-2.0
2. README.md:快速启动、状态机、API 示例、架构图
3. SECURITY.md:漏洞提交通道和 SLA
4. Issue/PR 模板
5. 首版标签:v0.1.0
6. 
发布后维护 CHANGELOG.md(Breaking Change 要写迁移步骤)

---

## 10)持续维护路线图(建议)

- v0.1:事故、时间线、复盘、演练、指标闭环
- v0.2:RBAC、多租户、Webhook(飞书/Slack/钉钉)
- v0.3:演练评分模型(RTO/RPO/响应协作得分)
- v1.0:审计日志、SSO、插件化演练动作(K8s/DB/Cache)

---

## 11)工程版最容易踩坑的点

1. 演练任务与生产任务混跑,缺少资源隔离
2. 时间线依赖人工补录,导致复盘不完整
3. MTTR 口径不统一(检测时间/恢复时间定义混乱)
4. Action Item 没有逾期提醒和闭环追踪
5. 演练“只执行不复盘”,无法沉淀改进资产

---

这套骨架已经能作为开源首版:先上线 Incident + Timeline + Postmortem + Drill + Metrics 主链路,再迭代权限、通知和插件能力。