orchestrator 安装？参数？VIP脚本？

最新推荐文章于 2025-10-16 16:48:04 发布

原创最新推荐文章于 2025-10-16 16:48:04 发布 · 1.5k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#orchestrator #MySQL

MySQL 专栏收录该内容

15 篇文章

订阅专栏

本文详细介绍如何在 CentOS 7 上部署 Orchestrator，包括安装准备、具体步骤、配置文件详解及启动流程。同时提供了相关参考资料。

1. 准备

CentOS 7（因为要求glibc版本>=2.14）
vim /etc/hosts （所有机器都要写，都要编辑）
iptables 、 selinux 都要关闭
下载地址：https://github.com/github/orchestrator/releases/

2. 安装

安装要求：
CentOS 7（因为要求glibc版本>=2.14）
关闭 iptables（或在 iptables 中放行 orch 所使用的端口），并关闭 selinux
vim /etc/hosts : ip host

1.安装orch（orchestrator、orchestrator-client）

rpm -ivh orch*
安装依赖包jq-1.5、oniguruma-5.9.5

2.因orch依赖数据库，这里采用MySQL数据库：orch与DB是1:1的关系

安装MySQL（略）
在orch后端数据库：
CREATE DATABASE IF NOT EXISTS orchestrator;
CREATE USER 'orchestrator'@'%' IDENTIFIED BY '123123';
GRANT ALL PRIVILEGES ON orchestrator.* TO 'orchestrator'@'%';

3.在orch管理的数据库集群上创建账号并授权（这儿是一主两从，也可以是双master）

CREATE USER 'orchestrator'@'%' IDENTIFIED BY '123123';
GRANT SUPER, PROCESS, REPLICATION SLAVE, RELOAD ON *.* TO 'orchestrator'@'%';
GRANT SELECT ON mysql.slave_master_info TO 'orchestrator'@'%';
CREATE DATABASE IF NOT EXISTS meta;
GRANT SELECT ON meta.* TO 'orchestrator'@'%';

被管理的数据库 my.cnf 要加上以下参数：report_host=、report_port=、log_slave_updates=1

4.修改配置文件orchestrator.conf.json

模板：

{
  "Debug": true,           #debug模式，输出详细信息   
  "EnableSyslog": false,   #是否输出到系统日志里
  "ListenAddress": ":3000",#orch监听的端口，web端口
  "MySQLTopologyUser": "orchestrator",      #后端被管理的集群账号：所有实例都要有
  "MySQLTopologyPassword": "123123",        #后端被管理的集群密码
  "MySQLTopologyCredentialsConfigFile": "", #后端集群的用户密码配置文件，账号密码可以直接写入文件，读取
  "MySQLTopologySSLPrivateKeyFile": "",     #SSL相关
  "MySQLTopologySSLCertFile": "",
  "MySQLTopologySSLCAFile": "",
  "MySQLTopologySSLSkipVerify": true,      #跳过SSL验证
  "MySQLTopologyUseMutualTLS": false,      #是否使用TLS验证
  "MySQLOrchestratorHost": "192.168.192.30",    #orch使用的数据库所在主机
  "MySQLOrchestratorPort": 3307,           #orch使用的数据库端口
  "MySQLOrchestratorDatabase": "orchestrator",  #连接 orch使用的数据库的 库名
  "MySQLOrchestratorUser": "orchestrator",      #连接 orch使用的数据库的 用户名
  "MySQLOrchestratorPassword": "123123",        #连接 orch使用的数据库的 密码
  "MySQLOrchestratorCredentialsConfigFile": "", #orch使用数据库的密码验证配置文件
  "MySQLOrchestratorSSLPrivateKeyFile": "",     #SSL相关
  "MySQLOrchestratorSSLCertFile": "",
  "MySQLOrchestratorSSLCAFile": "",
  "MySQLOrchestratorSSLSkipVerify": true,       #跳过SSL验证
  "MySQLOrchestratorUseMutualTLS": false,       #是否使用TLS验证
  "MySQLConnectTimeoutSeconds": 1,              #orch连接MySQL超时秒数
  "DefaultInstancePort": 3306,             #mysql 集群实例端口，对外提供服务的实例（尽量做到集群内实例端口一致）
  "DiscoverByShowSlaveHosts": true,        #是否使用show slave hosts自动发现集群
  "InstancePollSeconds": 5,                #使用show  slave hosts 探测间隔秒数
  "UnseenInstanceForgetHours": 240,        #忽略不可见的实例的小时数
  "SnapshotTopologiesIntervalHours": 0,    #快照拓扑调用之间的小时间隔。默认：0（表示禁用）
  "InstanceBulkOperationsWaitTimeoutSeconds": 10,#执行批量操作时，在单个实例上等待的时间
  "HostnameResolveMethod": "default",      #解析主机名，默认使用主机名：default；不解析为none，直接用IP
  "MySQLHostnameResolveMethod": "@@hostname",    #解析主机名，发出select @@hostname或发出select @@report_host（需要配置report_host）；不解析用""，直接用IP。
  "SkipBinlogServerUnresolveCheck": true,  #跳过检查 将未解析的主机名解析为和binlog服务器相同的主机名
  "ExpiryHostnameResolvesMinutes": 60,     #域名检测过期周期（分钟）
  "RejectHostnameResolvePattern": "",      #正则表达式；解析得到的主机名若匹配该模式则不被接受（不缓存、不写入数据库）
  "ReasonableReplicationLagSeconds": 100,  #复制延迟高于100S表示异常
  "ProblemIgnoreHostnameFilters": [],      #将主机做正则匹配筛选成最小化
  "VerifyReplicationFilters": false,       #在拓扑重构之前检查复制筛选器
  "ReasonableMaintenanceReplicationLagSeconds": 20, #复制延迟高于该值会上下移动调整MySQL拓扑
  "CandidateInstanceExpireMinutes": 60,    #该时间之后，使用实例作为候选从库（在主故障转移时提升）的建议到期
  "AuditLogFile": "",                      #审计日志，空的时候禁用
  "AuditToSyslog": false,                  #审计日志是否写入到系统日志
  "RemoveTextFromHostnameDisplay": ":3306", #去除集群的文本
  "ReadOnly": false,                #全局只读
  "AuthenticationMethod": "",       #身份验证类型
  "HTTPAuthUser": "",               #http验证用户名
  "HTTPAuthPassword": "",
  "AuthUserHeader": "",
  "PowerAuthUsers": [
    "*"
  ],
  "ClusterNameToAlias": {           #正则表达式匹配集群名称和别名之间的映射
    "127.0.0.1": "test suite"
  },
  "SlaveLagQuery": "",         #使用show slave status 进行延迟判断
  "DetectClusterAliasQuery": "SELECT SUBSTRING_INDEX(@@hostname, '.', 1)",#查询集群别名
  "DetectClusterDomainQuery": "",#可选查询，返回集群主服务器的VIP/别名/域名。
  "DetectInstanceAliasQuery": "",#可选查询，返回实例的别名
  "DetectPromotionRuleQuery": "",#可选查询，返回实例的提升规则
  "DataCenterPattern": "[.]([^.]+)[.][^.]+[.]mydomain[.]com",#从主机名称中提取数据中心名称
  "PhysicalEnvironmentPattern": "[.]([^.]+[.][^.]+)[.]mydomain[.]com", #从主机名中提取物理环境信息
  "PromotionIgnoreHostnameFilters": [],   #不使用主机名匹配模式来提升副本
  "DetectSemiSyncEnforcedQuery": "",      #检测是否强制半同步
  "ServeAgentsHttp": false,               #是否另起一个专用于orchestrator-agent的HTTP服务
  "AgentsServerPort": ":3001",            #供orchestrator-agent回调的监听端口
  "AgentsUseSSL": false,                  #当为true时，orch将使用SSL侦听代理端口已经通过SSL连接的代理
  "AgentsUseMutualTLS": false,            #当为true时，使用TLS服务器与代理通信
  "AgentSSLSkipVerify": false,            #为代理使用SSL
  "AgentSSLPrivateKeyFile": "",           #
  "AgentSSLCertFile": "",
  "AgentSSLCAFile": "",
  "AgentSSLValidOUs": [],
  "UseSSL": false,                       #在服务器WEB端口上使用SSL
  "UseMutualTLS": false,                 #true时使用TLS作为服务器的WEB和API连接                
  "SSLSkipVerify": false,
  "SSLPrivateKeyFile": "",
  "SSLCertFile": "",
  "SSLCAFile": "",
  "SSLValidOUs": [],               #使用TLS交互
  "URLPrefix": "",                 #在非根web路径上运行orch的URL前缀
  "StatusEndpoint": "/api/status", #状态查看
  "StatusSimpleHealth": true,
  "StatusOUVerify": false,
  "AgentPollMinutes": 60,          #代理之间轮询的分钟数
  "UnseenAgentForgetHours": 6,     #忘记不可见代理的小时数
  "StaleSeedFailMinutes": 60,      #无进展60分钟后被认为失败
  "SeedAcceptableBytesDiff": 8192, #种子源和目标源数据大小的字节差异仍被视为成功复制
  "PseudoGTIDPattern": "",         #为空时，禁用基于伪GTID的重构  
  "PseudoGTIDPatternIsFixedSubstring": false, #如为TRUE，则上个参数不被视为正则表达式而被视为固定子字符串
  "PseudoGTIDMonotonicHint": "asc:",          #Pseudo-GTID条目中的子字符串，表示Pseudo-GTID条目预计会单调递增
  "DetectPseudoGTIDQuery": "",                #可选查询，用于决定是否在实例上启用伪GTID
  "BinlogEventsChunkSize": 10000,             #show binlog events 块的大小。较小意味着更少的锁定和工作要做
  "SkipBinlogEventsContaining": [],           #扫描/比较Pseudo-GTID的binlog 时，跳过包含给定文本的条目。这些不是正则表达式（扫描binlog时会消耗太多的CPU），只需查找子字符串
  "ReduceReplicationAnalysisCount": true,     #如果为true，则复制分析将报告可能首先处理问题的可能性的实例。 如果为false，则为每个已知实例提供一个条目
  "FailureDetectionPeriodBlockMinutes": 60,   #在该时间内再次出现故障，不会被再次发现
  "RecoveryPeriodBlockSeconds": 3600,         #在该时间内再次出现故障，不会进行failover，避免出现并发恢复和不稳定
  "RecoveryIgnoreHostnameFilters": [],        #恢复会忽略的主机
  "RecoverMasterClusterFilters": [            #只对能匹配这些正则表达式模式的集群进行主故障恢复
    "*"
  ],
  "RecoverIntermediateMasterClusterFilters": [ #只对能匹配这些正则表达式模式的集群进行中间主故障恢复（“*”模式匹配所有）
    "*"
  ],
  "OnFailureDetectionProcesses": [    #检测到主故障时执行的命令和脚本
    "echo 'Detected {failureType} on {failureCluster}. Affected replicas: {countSlaves}' >> /tmp/recovery.log"
  ],
  "PreGracefulTakeoverProcesses": [   #在执行故障转移之前执行的命令和脚本 
    "echo 'Planned takeover about to take place on {failureCluster}. Master will switch to read_only' >> /tmp/recovery.log"
  ],
  "PreFailoverProcesses": [           #执行恢复操作前执行
    "echo 'Will recover from {failureType} on {failureCluster}' >> /tmp/recovery.log"
  ],
  "PostFailoverProcesses": [          #在failover全部成功后执行
    "echo '(for all types) Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostUnsuccessfulFailoverProcesses": [], #在failover失败后执行
  "PostMasterFailoverProcesses": [         #在主恢复成功结束时执行
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Promoted: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostIntermediateMasterFailoverProcesses": [  #在中间主成功恢复结束时执行
    "echo 'Recovered from {failureType} on {failureCluster}. Failed: {failedHost}:{failedPort}; Successor: {successorHost}:{successorPort}' >> /tmp/recovery.log"
  ],
  "PostGracefulTakeoverProcesses": [            #在新主晋升之后执行
    "echo 'Planned takeover complete' >> /tmp/recovery.log"
  ],
  "CoMasterRecoveryMustPromoteOtherCoMaster": true, #当false时，任何实例都可以得到提升；当true时，只会提升另一个co-master（双主中的另一个主），否则恢复失败
  "DetachLostSlavesAfterMasterFailover": true,      #恢复过程中可能会丢失一些副本。如果为true，将通过detach-replica命令强制中断其复制，并认为它们不正常运行。
  "ApplyMySQLPromotionAfterMasterFailover": true,   #在主上执行reset slave all，并设置read_only=0
  "PreventCrossDataCenterMasterFailover": false,    #如果为true（默认值：false），则不允许跨DC主故障转移，orchestrator将尽其所能仅在同一DC内进行故障转移，否则不进行故障转移。
  "MasterFailoverDetachSlaveMasterHost": false,     #是否应该在新升级的master上发出detach-replica-master-host，这样可以确保新master不会尝试复制正常之后的旧的master。如果参数ApplyMySQLPromotionAfterMasterFailover为True，则该参数无意义。
  "MasterFailoverLostInstancesDowntimeMinutes": 0,  #主故障转移后丢失的服务器停机的分钟数（包括失败的主和丢失的从）。0表示禁用
  "PostponeSlaveRecoveryOnLagMinutes": 0,           #在崩溃恢复时，延迟超过给定分钟的从库在主被选出后才复活。 值为0将禁用此功能。
  "OSCIgnoreHostnameFilters": [],
  "GraphiteAddr": "",
  "GraphitePath": "",
  "GraphiteConvertHostnameDotsToUnderscores": true,
  "ConsulAddress": "",
  "ConsulAclToken": "",
  
  "RaftEnabled": true,   #raft模式
  "BackendDB": "mysql",  #后台数据库类型
  "RaftBind": "192.168.192.20",  #绑定地址，本机IP
  "RaftDataDir": "/var/lib/orchestrator",  #数据目录，如果不存在，则自动创建
  "DefaultRaftPort": 10008,  #raft通信端口，所有机器必须保持一致
  "RaftNodes": [   #raft节点，必须包含所有节点
    "192.168.192.20", 
    "192.168.192.21",
    "192.168.192.22"
    ]
}

5.启动

注意：一定要先 cd 到 orchestrator 安装目录，再用相对路径（./orchestrator）启动，不要用绝对路径直接启动

cd /usr/local/orchestrator
./orchestrator --debug --config=/usr/local/orchestrator/orchestrator.conf.json http   &

6.WEB界面

在这里插入图片描述

参考：
https://www.cnblogs.com/geek-ace/p/9441706.html#commentform
https://www.cnblogs.com/zhoujinyi/p/10387581.html

VIP脚本

参考文末 GitHub 项目自己编写的VIP切换脚本，下载路径：
https://download.youkuaiyun.com/download/wangxin3618/11421332
https://download.youkuaiyun.com/download/wangxin3618/11421340

参考：https://github.com/theTibi/orchestrator_vip