package task
import (
"context"
"fmt"
v1 "gc-sika-deer/internal/grpcpb/v1"
"gc-sika-deer/internal/server/conf"
"gc-sika-deer/internal/server/dao"
"gc-sika-deer/pkg/common"
"gc-sika-deer/pkg/tools"
"gc-sika-deer/pkg/util"
"strings"
"sync"
"time"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
)
// MySQLInstDiscoverTask is the periodic task whose Execute method reconciles
// the MySQL instances reported by host agents with the instances stored in
// the database.
type MySQLInstDiscoverTask struct {
// logEntry is the logger used by every method of the task; the constructor
// tags it with the task name under the "Prefix" field.
logEntry *logrus.Entry
// interval is the scheduling period of the task, expressed in seconds.
interval int // seconds
}
// Name returns the fixed, human-readable name of the task.
func (*MySQLInstDiscoverTask) Name() string {
	const taskName = "MySQL Instance Discover Task"
	return taskName
}
// SecondInterval returns the interval, in seconds, that the task was
// constructed with.
func (task *MySQLInstDiscoverTask) SecondInterval() int {
	seconds := task.interval
	return seconds
}
// NewMySQLInstDiscoverTask builds a MySQLInstDiscoverTask that reports the
// given interval (in seconds) and logs with its own name in the "Prefix"
// field. The task is returned as a SikaTask.
func NewMySQLInstDiscoverTask(interval int) SikaTask {
	discover := new(MySQLInstDiscoverTask)
	discover.interval = interval
	discover.logEntry = logrus.WithField("Prefix", discover.Name())
	return discover
}
// Execute runs one discovery round:
//
//  1. load the hosts whose agent is running, all hosts matching the selector,
//     and the non-offline MySQL instances already stored in the database;
//  2. ask every running host's agent, concurrently, for its MySQL instances
//     and for the instances related to them;
//  3. diff the agent view against the database view, fill in instance id,
//     master instance id and cluster id, and persist the result;
//  4. do the same for "neighbour" instances, i.e. related instances whose host
//     is not registered in the platform.
//
// Any failure while loading the initial data aborts the round after logging.
func (task *MySQLInstDiscoverTask) Execute() {
	selector := dao.GetHostSelector()

	runningHosts, err := dao.QueryHostInfoByAgentStatus(common.AgentStatusRunning, selector)
	if err != nil {
		task.logEntry.Errorf("QueryHostInfoByAgentStatus: %v, agent_status=%d", err, common.AgentStatusRunning)
		return
	}

	knownHosts, err := dao.QueryHostInfoBySelector(selector)
	if err != nil {
		task.logEntry.Errorf("QueryAllHostInfo:%v", err)
		return
	}
	hostIndex := task.makeHostMap(knownHosts)

	storedInsts, err := dao.QueryNotOfflineMySQLInstance(selector)
	if err != nil {
		task.logEntry.Errorf("QueryNotOfflineMySQLInstance:%v", err)
		return
	}

	var (
		guard sync.Mutex
		// discovered accumulates every MySQL instance reported by any agent.
		discovered []*dao.MySQLInstance
	)
	neighbourIndex := make(map[string]*dao.MySQLInstance)

	// collect is run once per running host. A failing host is only logged at
	// debug level; getAllMySQLInstance returns nil slices on error, so such a
	// host simply contributes nothing.
	collect := func(host *dao.HostInfo) {
		found, related, err := getAllMySQLInstance(host)
		if err != nil {
			task.logEntry.Debugf("获取机器(host=%s)上的MySQL实例出错:%v", host.HostIP, err)
		}
		// The slice and the neighbour map are shared between workers.
		guard.Lock()
		defer guard.Unlock()
		discovered = append(discovered, found...)
		task.tryUpdNeighbourMap(hostIndex, related, neighbourIndex)
	}
	util.WithConcurrencyRun(collect, concurrencyLimit, runningHosts)

	// All three maps are keyed by source_hostIP_port: dbIndex holds instances
	// already in the database, neighbourDBIndex the stored neighbours, and
	// agentIndex the instances discovered by agents.
	dbIndex, neighbourDBIndex, agentIndex := task.makeInstMap(storedInsts, discovered)

	// Difference between what the database holds and what the agents saw.
	added, kept, gone := task.getDiffInst(agentIndex, dbIndex)

	// Fill in the identifiers of the discovered instances; the order matters
	// because the master id and the cluster id are derived from other
	// discovered instances.
	task.completeInstanceId(dbIndex, agentIndex)
	task.completeMasterInstanceId(agentIndex)
	task.completeClusterId(agentIndex, dbIndex)

	// Persist the instance changes.
	task.updateInstance(added, kept, gone, dbIndex, agentIndex)

	// Neighbours: related instances whose host is not part of the platform.
	addedNeigh, keptNeigh, goneNeigh := task.getDiffNeighInst(neighbourIndex, neighbourDBIndex, dbIndex)
	task.updateInstanceNeighbour(addedNeigh, keptNeigh, goneNeigh, neighbourIndex)
}
// completeClusterId assigns a ClusterId to the agent-discovered instances in
// instAgentMap. Both maps are keyed by source_ip_port.
//
// Pass 1 handles every instance whose role is not slave: an instance already
// present in instDBMap gets its id from tryGetClusterId, a newly discovered
// one gets a freshly generated id.
//
// Pass 2 handles every instance whose role is not master and that reports a
// master address: it inherits the ClusterId of that master when the master is
// itself present in instAgentMap.
//
// NOTE(review): for a slave whose master is another slave, the inherited value
// depends on whether the master was already visited in pass 2, and Go map
// iteration order is unspecified — confirm whether cascaded replication needs
// handling here.
func (task *MySQLInstDiscoverTask) completeClusterId(instAgentMap, instDBMap map[string]*dao.MySQLInstance) {
	// Pass 1: masters (anything that is not a slave).
	for key, inst := range instAgentMap {
		if inst.Role == common.MySQLInstanceRoleSlave {
			continue
		}
		stored, known := instDBMap[key]
		if !known {
			// Newly discovered master: generate its cluster id here.
			task.logEntry.Infof("发现新的主库:[%s]", key)
			inst.ClusterId = uuid.NewString()
			continue
		}
		inst.ClusterId = task.tryGetClusterId(stored, inst)
	}

	// Pass 2: slaves (anything that is not a master).
	for key, inst := range instAgentMap {
		switch {
		case inst.Role == common.MySQLInstanceRoleMaster:
			continue
		case inst.MasterInstanceIP == "" && inst.MasterInstancePort == 0:
			// A slave whose master is not inside the platform: skip it.
			continue
		}

		// The first key segment is the source; reuse it to build the key of
		// the master instance.
		parts := strings.Split(key, "_")
		if len(parts) != 3 {
			task.logEntry.Errorf("completeClusterId时,instAgentMap中的key格式不正确,key=%s", key)
			continue
		}
		masterKey := fmt.Sprintf("%s_%s_%d", parts[0], inst.MasterInstanceIP, inst.MasterInstancePort)
		if master, found := instAgentMap[masterKey]; found {
			// A slave takes the cluster id of its master.
			inst.ClusterId = master.ClusterId
		}
	}
}
// tryGetClusterId returns the cluster id for a master instance that already
// exists in the database.
//
// If the stored row (instInDB) carries a ClusterId it is returned directly.
// Otherwise the function connects to the instance, queries the nodes related
// to it, and looks each of them (and the instance itself) up via
// dao.QueryClusterInfoByUniqueKey; the first lookup returning exactly one
// cluster decides the id. When no such cluster with a non-empty id is found a
// new id is generated.
//
// An empty string is returned when the connection or the related-node query
// fails.
func (task *MySQLInstDiscoverTask) tryGetClusterId(instInDB, instInAgent *dao.MySQLInstance) string {
	if existing := instInDB.ClusterId; existing != "" {
		return existing
	}
	task.logEntry.Infof("给已经存在的主实例(%s)赋值cluster_id时,cluster_id为空,需要从mysql_cluster表中匹配出cluster_id", instInDB.GetUniqueKey())

	// Connect to the instance itself in order to discover its related nodes.
	db, err := tools.OpenShortDB(instInAgent.IP, instInAgent.Port, conf.ServerConfig.MySQLAccount.User, conf.ServerConfig.MySQLAccount.Password, "")
	if err != nil {
		task.logEntry.Errorf("实例找回cluster_id失败:查询相关联实例(%s)时,建立连接失败:%v", instInAgent.GetUniqueKey(), err)
		return ""
	}
	defer db.Close()

	self := &common.MySQLNode{IP: instInAgent.IP, Port: instInAgent.Port}
	peers, err := common.QueryMySQLInstanceRelatedNode(db, self)
	if err != nil {
		task.logEntry.Errorf("获取相关MySQLNode失败:%v, thisNode=%+v", err, self)
		return ""
	}

	// Related nodes are checked first, the instance itself last.
	var matched *dao.MySQLCluster
	for _, node := range append(peers, self) {
		clusters, err := dao.QueryClusterInfoByUniqueKey(instInAgent.Source, node.IP, node.Port)
		if err != nil {
			task.logEntry.Warningf("QueryClusterInfoByUniqueKey出错,source=%d,node=%+v;err=%v", instInAgent.Source, node, err)
			continue
		}
		if len(clusters) == 1 {
			matched = clusters[0]
			break
		}
	}

	if matched == nil || matched.ClusterId == "" {
		// Nothing usable in the cluster table: fall back to a new id.
		fresh := uuid.NewString()
		task.logEntry.Errorf("实例没能找回cluster_id,重新生成了一个clusterId=%s", fresh)
		return fresh
	}
	task.logEntry.Infof("实例成功找到cluster_id:instInAgent=%v,clusterId=%s", instInAgent, matched.ClusterId)
	return matched.ClusterId
}
// completeMasterInstanceId sets MasterInstanceId on every non-master instance
// in instAgentMap whose master (same source, MasterInstanceIP and
// MasterInstancePort) is also present in instAgentMap. Instances whose master
// is not in the map are left untouched.
func (task *MySQLInstDiscoverTask) completeMasterInstanceId(instAgentMap map[string]*dao.MySQLInstance) {
	for key, inst := range instAgentMap {
		if inst.Role == common.MySQLInstanceRoleMaster {
			continue
		}

		// Keys look like source_ip_port; the source segment is reused to
		// build the key of the master.
		parts := strings.Split(key, "_")
		if len(parts) != 3 {
			task.logEntry.Errorf("completeMasterInstanceId时,instAgentMap中的key格式不正确,key=%s", key)
			continue
		}

		masterKey := fmt.Sprintf("%s_%s_%d", parts[0], inst.MasterInstanceIP, inst.MasterInstancePort)
		master, found := instAgentMap[masterKey]
		if !found {
			continue
		}
		inst.MasterInstanceId = master.InstanceId
	}
}
// completeInstanceId sets InstanceId on every instance in instAgentMap: the id
// of the database row with the same key when one exists in instDBMap,
// otherwise a newly generated UUID.
func (*MySQLInstDiscoverTask) completeInstanceId(instDBMap, instAgentMap map[string]*dao.MySQLInstance) {
	for key, discovered := range instAgentMap {
		stored, known := instDBMap[key]
		if !known {
			// Newly discovered instance: its id is created here.
			discovered.InstanceId = uuid.NewString()
			continue
		}
		discovered.InstanceId = stored.InstanceId
	}
}
// makeInstMap indexes instances by the key produced by generatedMapKey.
//
// Rows from instInDB whose FoundType is neighbour or unknown go into
// neighbourDBMap, every other row goes into instDBMap. All agent-discovered
// instances go into instAgentMap. When two instances share a key the later
// one in the slice wins.
func (task *MySQLInstDiscoverTask) makeInstMap(instInDB, aInstInAgent []*dao.MySQLInstance) (instDBMap, neighbourDBMap, instAgentMap map[string]*dao.MySQLInstance) {
	instDBMap = map[string]*dao.MySQLInstance{}
	neighbourDBMap = map[string]*dao.MySQLInstance{} // neighbours already stored in the database
	for _, stored := range instInDB {
		target := instDBMap
		if stored.FoundType == common.MySQLInstanceFoundTypeNeighbour || stored.FoundType == common.MySQLInstanceFoundTypeUnknown {
			target = neighbourDBMap
		}
		target[task.generatedMapKey(stored)] = stored
	}

	instAgentMap = map[string]*dao.MySQLInstance{}
	for _, discovered := range aInstInAgent {
		instAgentMap[task.generatedMapKey(discovered)] = discovered
	}
	return instDBMap, neighbourDBMap, instAgentMap
}
// makeHostMap indexes hosts by "<source>_<hostIP>". When two hosts share a key
// the later one in allHosts wins.
func (task *MySQLInstDiscoverTask) makeHostMap(allHosts []*dao.HostInfo) (allHostMap map[string]*dao.HostInfo) {
	allHostMap = make(map[string]*dao.HostInfo, len(allHosts))
	for i := range allHosts {
		h := allHosts[i]
		allHostMap[fmt.Sprintf("%d_%s", h.Source, h.HostIP)] = h
	}
	return allHostMap
}
// getDiffInst compares the agent view with the database view, key by key.
//
//   - newInsts: agent instances with no database row (the agent objects);
//   - oldInsts: instances present on both sides (the database objects, which
//     carry the more complete data);
//   - offInsts: database rows no agent reported (the database objects).
//
// All three results are non-nil, possibly empty, slices.
func (*MySQLInstDiscoverTask) getDiffInst(instAgentMap, instDBMap map[string]*dao.MySQLInstance) (newInsts, oldInsts, offInsts []*dao.MySQLInstance) {
	newInsts = make([]*dao.MySQLInstance, 0)
	oldInsts = make([]*dao.MySQLInstance, 0)
	offInsts = make([]*dao.MySQLInstance, 0)

	for key, discovered := range instAgentMap {
		stored, known := instDBMap[key]
		if !known {
			newInsts = append(newInsts, discovered)
			continue
		}
		oldInsts = append(oldInsts, stored)
	}

	for key, stored := range instDBMap {
		if _, seen := instAgentMap[key]; seen {
			continue
		}
		offInsts = append(offInsts, stored)
	}
	return newInsts, oldInsts, offInsts
}
// getDiffNeighInst compares the neighbours discovered in this round with the
// neighbours stored in the database, key by key.
//
//   - oldInsts: discovered neighbours that already have a database row, either
//     as a neighbour (neighDBMap) or as a regular instance that has now become
//     a neighbour (instDBMap); the database object is returned;
//   - newInsts: discovered neighbours with no database row at all;
//   - offInsts: stored neighbours that were not discovered this round.
//
// All three results are non-nil, possibly empty, slices.
func (*MySQLInstDiscoverTask) getDiffNeighInst(neighAgentMap, neighDBMap, instDBMap map[string]*dao.MySQLInstance) (newInsts, oldInsts, offInsts []*dao.MySQLInstance) {
	newInsts = make([]*dao.MySQLInstance, 0)
	oldInsts = make([]*dao.MySQLInstance, 0)
	offInsts = make([]*dao.MySQLInstance, 0)

	for key, discovered := range neighAgentMap {
		if stored, ok := neighDBMap[key]; ok {
			// Already known as a neighbour.
			oldInsts = append(oldInsts, stored)
			continue
		}
		if stored, ok := instDBMap[key]; ok {
			// A former regular instance that turned into a neighbour.
			oldInsts = append(oldInsts, stored)
			continue
		}
		// Never seen before.
		newInsts = append(newInsts, discovered)
	}

	for key, stored := range neighDBMap {
		if _, seen := neighAgentMap[key]; seen {
			continue
		}
		offInsts = append(offInsts, stored)
	}
	return newInsts, oldInsts, offInsts
}
// tryUpdNeighbourMap records first-order neighbours in neighbourMap: related
// instances whose host ("<source>_<ip>") is not present in allHostMap. Such
// hosts are not registered in the platform and are treated as machines outside
// of it. Every recorded neighbour is marked as not taken over.
//
// neighbourMap is mutated in place; the caller must serialise concurrent
// calls that share the same map.
func (task *MySQLInstDiscoverTask) tryUpdNeighbourMap(allHostMap map[string]*dao.HostInfo, relatedInsts []*dao.MySQLInstance, neighbourMap map[string]*dao.MySQLInstance) {
	for i := range relatedInsts {
		candidate := relatedInsts[i]
		_, onPlatform := allHostMap[fmt.Sprintf("%d_%s", candidate.Source, candidate.IP)]
		if onPlatform {
			continue
		}
		mapKey := task.generatedMapKey(candidate)
		candidate.TakeoverState = common.MySQLInstanceTakeoverStateUntaked
		neighbourMap[mapKey] = candidate
	}
}
// updateInstance persists the outcome of one discovery round.
//
//   - oldInsts (database rows also seen by an agent) are merged with their
//     agent counterpart through tryUpdInstance and written back when that
//     reports a change;
//   - offInsts (database rows no agent reported) are probed directly with the
//     server account and written back when their status changed;
//   - newInsts are bulk-inserted.
//
// Errors are logged and never abort the remaining work.
func (task *MySQLInstDiscoverTask) updateInstance(newInsts, oldInsts, offInsts []*dao.MySQLInstance, instDBMap, instAgentMap map[string]*dao.MySQLInstance) {
	// Instances known on both sides.
	for _, stored := range oldInsts {
		discovered := instAgentMap[task.generatedMapKey(stored)]
		if !task.tryUpdInstance(stored, discovered) {
			continue
		}

		latest, err := dao.QueryMySQLInstanceById(stored.ID)
		if err != nil {
			task.logEntry.Errorf("更新实例信息失败,查询最新的MySQLInstance(ID=%d)信息出错:%v", stored.ID, err)
			continue
		}
		// The row may have been modified while the topology was being built;
		// in that case the role currently in the database takes precedence.
		if stored.UpdateTime.Before(latest.UpdateTime) && stored.Role != latest.Role {
			task.logEntry.Infof("在做集群拓扑的过程中,实例的role字段有更新:【%v】==>[%v]", stored.Role, latest.Role)
			stored.Role = latest.Role
		}

		if err := dao.UpdateMySQLInstance(stored); err != nil {
			task.logEntry.Errorf("更新实例信息(%+v)到数据库中出错:%v", stored, err)
			continue
		}
		task.logEntry.Infof("更新实例信息(%+s)到数据库成功", stored.ClusterId)
	}

	// Instances no agent reported: probe them with the server account.
	user := conf.ServerConfig.MySQLAccount.User
	password := conf.ServerConfig.MySQLAccount.Password
	for _, gone := range offInsts {
		previous := gone.Status // status before this round

		access := common.MySQLAccessType(tools.GetMySQLAccessType(user, password, gone.IP, gone.Port))
		if access > common.MySQLAccessDenied {
			// The agent is down but the instance itself is still reachable.
			gone.Status = common.MySQLInstanceStatusRunning
		} else {
			gone.Status = common.MySQLInstanceStatusAbnormal
		}

		// Write back only when the entry held in instDBMap under this key now
		// differs from the status recorded before probing.
		current, tracked := instDBMap[task.generatedMapKey(gone)]
		if !tracked || current.Status == previous {
			continue
		}
		// NOTE(review): both messages below say "abnormal" although this path
		// is also taken when the status changes to running.
		if err := dao.UpdateMySQLInstance(gone); err != nil {
			task.logEntry.Errorf("更新实例(%+v)状态为运行异常出错:%v", gone, err)
		} else {
			task.logEntry.Infof("更新实例(%+v)状态为运行异常成功", gone)
		}
	}

	// Newly discovered instances.
	if len(newInsts) == 0 {
		return
	}
	if err := dao.InsertIgnoreMySQLInstances(newInsts); err != nil {
		task.logEntry.Errorf("插入新发现的实例(%+v)出错:%v", newInsts, err)
		return
	}
	task.logEntry.Infof("插入新发现的实例(%+v)成功", newInsts)
}
// generatedMapKey builds the "<source>_<ip>_<port>" key under which an
// instance is stored in the instance maps of this task.
func (task *MySQLInstDiscoverTask) generatedMapKey(inst *dao.MySQLInstance) string {
	const keyFormat = "%d_%s_%d"
	return fmt.Sprintf(keyFormat, inst.Source, inst.IP, inst.Port)
}
// tryUpdInstance copies the key fields of t (the instance as discovered by an
// agent) onto s (the row loaded from the database) wherever they differ, and
// reports whether s was modified and therefore has to be written back.
//
// Special cases:
//
//   - ClusterId is only copied when t.ClusterId is non-empty. There is no
//     master/slave switch-over logic yet, so after a master is shut down the
//     agent-discovered slave has no cluster id and copying it would blank the
//     stored one.
//   - When, after that step, the cluster ids still differ and s is a
//     solidified instance, s is marked as run-limited / "node left cluster"
//     instead of taking the status pair reported by the agent.
//
// The status pair that will finally be stored is resolved before it is
// compared with s, so the return value and the log message reflect what is
// actually written: a change made only by the solidified-instance rule is
// reported as an update, and a row that already carries the resolved status is
// not reported as changed.
func (task *MySQLInstDiscoverTask) tryUpdInstance(s, t *dao.MySQLInstance) bool {
	isUpd := false
	var msg string

	// TODO: revisit once master/slave switch-over is supported (see above).
	if t.ClusterId != "" && s.ClusterId != t.ClusterId {
		msg += fmt.Sprintf("cluster_id:[%s]==>[%s]\t", s.ClusterId, t.ClusterId)
		s.ClusterId = t.ClusterId
		isUpd = true
	}
	if s.MasterInstanceId != t.MasterInstanceId {
		msg += fmt.Sprintf("master_instance_id:[%s]==>[%s]\t", s.MasterInstanceId, t.MasterInstanceId)
		s.MasterInstanceId = t.MasterInstanceId
		isUpd = true
	}
	if s.Role != t.Role {
		msg += fmt.Sprintf("role:[%d]==>[%d]", s.Role, t.Role)
		s.Role = t.Role
		isUpd = true
	}
	if s.ClusterArch != t.ClusterArch {
		msg += fmt.Sprintf("cluster_arch:[%d]==>[%d]", s.ClusterArch, t.ClusterArch)
		s.ClusterArch = t.ClusterArch
		isUpd = true
	}
	if s.LatencySeconds != t.LatencySeconds {
		msg += fmt.Sprintf("latency_seconds:[%d]==>[%d]", s.LatencySeconds, t.LatencySeconds)
		s.LatencySeconds = t.LatencySeconds
		isUpd = true
	}

	// Resolve the status pair to store. A solidified instance whose cluster id
	// could not be aligned with the agent's view has left its original
	// cluster; otherwise the agent's status is authoritative.
	status, status2 := t.Status, t.Status2
	if s.ClusterId != t.ClusterId && s.SolidState == common.MySQLInstanceSolidSolid {
		status = common.MySQLInstanceStatusRunLimit
		status2 = common.MySQLInstanceStatus2NodeLeftCluster
	}
	if s.Status != status {
		msg += fmt.Sprintf("status:[%d]==>[%d]", s.Status, status)
		s.Status = status
		isUpd = true
	}
	if s.Status2 != status2 {
		msg += fmt.Sprintf("status2:[%d]==>[%d]", s.Status2, status2)
		s.Status2 = status2
		isUpd = true
	}

	if s.TakeoverState != t.TakeoverState {
		msg += fmt.Sprintf("takeover_state:[%d]==>[%d]", s.TakeoverState, t.TakeoverState)
		s.TakeoverState = t.TakeoverState
		isUpd = true
	}
	if s.DbVersion != t.DbVersion {
		msg += fmt.Sprintf("db_version:[%s]==>[%s]", s.DbVersion, t.DbVersion)
		s.DbVersion = t.DbVersion
		isUpd = true
	}
	if s.FoundType != t.FoundType {
		msg += fmt.Sprintf("found_type:[%v]==>[%d]", s.FoundType, t.FoundType)
		s.FoundType = t.FoundType
		isUpd = true
	}

	if isUpd {
		task.logEntry.Infof("实例信息(instance_id=%s)有更新:%s", t.InstanceId, msg)
	}
	return isUpd
}
// updateInstanceNeighbour persists the neighbour changes of one discovery
// round: neighbours that disappeared are deleted, new neighbours receive a
// fresh InstanceId and are bulk-inserted, and known neighbours are merged with
// their newly discovered counterpart from neighboursMap and written back when
// tryUpdNeighbour reports a change.
//
// Errors are logged and never abort the remaining work.
func (task *MySQLInstDiscoverTask) updateInstanceNeighbour(newNeighs, oldNeighs, offNeighs []*dao.MySQLInstance, neighboursMap map[string]*dao.MySQLInstance) {
	// Remove neighbours that are no longer reported.
	for _, gone := range offNeighs {
		err := dao.DeleteMySQLInstance(gone.ID)
		if err != nil {
			task.logEntry.Errorf("删除掉线邻居(%+v)出错:%s", gone, err)
			continue
		}
		task.logEntry.Infof("删除掉线邻居(%+v)成功", gone)
	}

	// Insert neighbours seen for the first time.
	if len(newNeighs) > 0 {
		for i := range newNeighs {
			newNeighs[i].InstanceId = uuid.NewString()
		}
		if err := dao.InsertIgnoreMySQLInstances(newNeighs); err != nil {
			task.logEntry.Errorf("批量插入新邻居(%+v)出错:%v", newNeighs, err)
		} else {
			task.logEntry.Infof("批量插入新邻居(%+v)成功", newNeighs)
		}
	}

	// Refresh neighbours that were already stored.
	for _, stored := range oldNeighs {
		discovered, found := neighboursMap[task.generatedMapKey(stored)]
		if !found || !task.tryUpdNeighbour(stored, discovered) {
			continue
		}
		if err := dao.UpdateMySQLInstance(stored); err != nil {
			task.logEntry.Errorf("更新邻居(%s)出错:%v", stored.InstanceId, err)
			continue
		}
		task.logEntry.Infof("更新邻居(%s)成功", stored.InstanceId)
	}
}
// tryUpdNeighbour copies the key fields of t (the neighbour as discovered in
// this round) onto s (the stored neighbour) wherever they differ and reports
// whether s was modified. The fields compared are MasterInstanceId, Role,
// ClusterArch, Status, DbVersion and FoundType; every change is summarised in
// a single log line.
func (task *MySQLInstDiscoverTask) tryUpdNeighbour(s, t *dao.MySQLInstance) bool {
	var (
		dirty bool
		diff  string
	)
	// note records one field change; it must be called before the field of s
	// is overwritten so that the old value is the one that gets formatted.
	note := func(format string, args ...interface{}) {
		diff += fmt.Sprintf(format, args...)
		dirty = true
	}

	if s.MasterInstanceId != t.MasterInstanceId {
		note("master_instance_id:[%s]==>[%s]\t", s.MasterInstanceId, t.MasterInstanceId)
		s.MasterInstanceId = t.MasterInstanceId
	}
	if s.Role != t.Role {
		note("role:[%d]==>[%d]", s.Role, t.Role)
		s.Role = t.Role
	}
	if s.ClusterArch != t.ClusterArch {
		note("cluster_arch:[%d]==>[%d]", s.ClusterArch, t.ClusterArch)
		s.ClusterArch = t.ClusterArch
	}
	if s.Status != t.Status {
		note("status:[%d]==>[%d]", s.Status, t.Status)
		s.Status = t.Status
	}
	if s.DbVersion != t.DbVersion {
		note("db_version:[%s]==>[%s]", s.DbVersion, t.DbVersion)
		s.DbVersion = t.DbVersion
	}
	if s.FoundType != t.FoundType {
		note("found_type:[%v]==>[%d]", s.FoundType, t.FoundType)
		s.FoundType = t.FoundType
	}

	if !dirty {
		return false
	}
	task.logEntry.Infof("一阶邻居信息(instance_id=%s)有更新:%s", s.InstanceId, diff)
	return true
}
// getAllMySQLInstance asks the agent running on host hi for its MySQL
// instances.
//
// It returns:
//
//   - mai: one entry per instance running on the host, marked as found by the
//     agent and as running. An instance counts as taken over when the agent
//     reports write access and the server-side account has at least read
//     access; otherwise it is marked as not taken over.
//   - relatedInsts: the instances the agent reports as related to the local
//     ones; they carry the source of hi but their own IP and port.
//   - err: non-nil when the agent cannot be reached or the RPC fails, in which
//     case both slices are nil. A connection failure wraps the underlying
//     error so callers can inspect it with errors.Is / errors.As.
//
// The RPC is bounded by a one-minute timeout.
func getAllMySQLInstance(hi *dao.HostInfo) (mai, relatedInsts []*dao.MySQLInstance, err error) {
	conn, err := getAgentConn(hi.HostIP)
	if err != nil {
		return nil, nil, fmt.Errorf("连接agent失败%w", err)
	}
	defer conn.Close()

	client := v1.NewSikaAgentClient(conn)
	ctx, cancel := context.WithTimeout(context.Background(), time.Minute)
	defer cancel()
	ai, err := client.GetAllMySQLInstanceInfo(ctx, &v1.MySQLInstanceInfoReq{HostIP: hi.HostIP})
	if err != nil {
		return nil, nil, err
	}

	// Access level of the server-side shared account on each instance; the
	// possible levels are: access denied, read-only, read-write.
	user := conf.ServerConfig.MySQLAccount.User
	password := conf.ServerConfig.MySQLAccount.Password
	for _, info := range ai.Infos {
		var takeover common.MySQLInstanceTakeoverState
		sat := tools.GetMySQLAccessType(user, password, hi.HostIP, info.Port)
		if info.AccessType == v1.MySQLAccessType_Write && sat >= tools.MySQLAccessTypeRead {
			takeover = common.MySQLInstanceTakeoverStateTaked
		} else {
			takeover = common.MySQLInstanceTakeoverStateUntaked
		}
		mai = append(mai, &dao.MySQLInstance{
			HostId: hi.HostId,
			Source: hi.Source,
			IP:     hi.HostIP,
			Port:   info.Port,
			// MasterInstanceId is deliberately not resolved here: doing so
			// would need a database query per instance, which hurts
			// performance.
			MasterInstanceIP:   info.MasterInstanceIP,
			MasterInstancePort: info.MasterInstancePort,
			Role:               common.MySQLInstanceRole(info.Role),
			ClusterArch:        common.MySQLClusterArch(info.Arch),
			LatencySeconds:     int(info.LatencySeconds),
			TakeoverState:      takeover,
			DbVersion:          info.DbVersion,
			FoundType:          common.MySQLInstanceFoundTypeAgent,
			// Every instance an agent can detect is alive.
			Status: common.MySQLInstanceStatusRunning,
		})
	}

	for _, related := range ai.RelatedInsts {
		relatedInsts = append(relatedInsts, &dao.MySQLInstance{
			MasterInstanceIP:   related.MasterInstanceIP,
			MasterInstancePort: related.MasterInstancePort,
			Source:             hi.Source,
			IP:                 related.Ip,
			Port:               related.Port,
			Role:               common.MySQLInstanceRole(related.Role),
			ClusterArch:        common.MySQLClusterArch(related.Arch),
			// NOTE(review): this converts a *cluster* status constant into an
			// instance status, whereas the loop above uses
			// common.MySQLInstanceStatusRunning — confirm both have the same
			// underlying value.
			Status: common.MySQLInstanceStatus(common.MySQLClusterStatusRunning),
		})
	}
	return mai, relatedInsts, nil
}
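// NOTE: the two lines that originally followed here ("帮我解释下这部分代码" and
// "最新发布", i.e. "please explain this code" and "latest release") are
// residue from the web page this file was copied from, not Go source.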