zabbix源码之zabbix alerter.c报警逻辑

最新推荐文章于 2025-11-08 10:10:05 发布

转载最新推荐文章于 2025-11-08 10:10:05 发布 · 2.1k 阅读

文章标签：

#zabbix #源码

zabbix 专栏收录该内容

1 篇文章

订阅专栏

本文深入剖析Zabbix告警机制，从execute_action函数出发，详细介绍不同媒介类型的告警发送流程，包括邮件、Jabber消息、短信及外部脚本执行等。同时，探讨了zabbix_server如何调度告警发送任务，并解释了告警状态更新的逻辑。

首先需要介绍的是 execute_action 函数。这个函数负责处理与 action 相关联的 mediatype，以及返回发送成功还是失败的状态，调用方再根据这个状态更新 db。

它会根据当初配置的 media 类型做不同的动作。比如是 MEDIA_TYPE_EMAIL 的话，就连接 smtp 服务器发送邮件；如果是 MEDIA_TYPE_EXEC 的话，就执行外部脚本，也就是调用第三方的命令。

C

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
 
	if (MEDIA_TYPE_EMAIL == mediatype->type)
	{
		alarm(ALARM_ACTION_TIMEOUT);
		res = send_email(mediatype->smtp_server, mediatype->smtp_helo, mediatype->smtp_email,
				alert->sendto, alert->subject, alert->message, error, max_error_len);
		alarm(0);
	}
#ifdef HAVE_JABBER
	else if (MEDIA_TYPE_JABBER == mediatype->type)
	{
		/* Jabber uses its own timeouts */
		res = send_jabber(mediatype->username, mediatype->passwd,
				alert->sendto, alert->subject, alert->message, error, max_error_len);
	}
#endif
	else if (MEDIA_TYPE_SMS == mediatype->type)
	{
		/* SMS uses its own timeouts */
		res = send_sms(mediatype->gsm_modem, alert->sendto, alert->message, error, max_error_len);
	}
	else if (MEDIA_TYPE_EZ_TEXTING == mediatype->type)
	{
		/* Ez Texting uses its own timeouts */
		res = send_ez_texting(mediatype->username, mediatype->passwd,
				alert->sendto, alert->message, mediatype->exec_path, error, max_error_len);
	}
	else if (MEDIA_TYPE_EXEC == mediatype->type)

这下面是zabbix里面具体调用scripts脚本的过程。

C


1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

 

		if (0 == access(cmd, X_OK))
		{
			send_to = zbx_dyn_escape_string(alert->sendto, "\"\\");
			subject = zbx_dyn_escape_string(alert->subject, "\"\\");
			message = zbx_dyn_escape_string(alert->message, "\"\\");

			zbx_snprintf_alloc(&cmd, &cmd_alloc, &cmd_offset, " \"%s\" \"%s\" \"%s\"",
					send_to, subject, message);

			zbx_free(send_to);
			zbx_free(subject);
			zbx_free(message);

			if (SUCCEED == (res = zbx_execute(cmd, &output, error, max_error_len, ALARM_ACTION_TIMEOUT)))
			{
				zabbix_log(LOG_LEVEL_DEBUG, "%s output:\n%s", mediatype->exec_path, output);
				zbx_free(output);
			}
			else
				res = FAIL;
		}
		else
			zbx_snprintf(error, max_error_len, "%s: %s", cmd, zbx_strerror(errno));

		zbx_free(cmd);

上面是触发 action 的相关函数，那肯定还有一个一直在调用 execute_action 的主循环函数。它的名字是 main_alerter_loop。既然是 loop，就知道它是做什么的了，逻辑很简单：zabbix_server 启动后，fork 出一个运行 main_alerter_loop 的子进程，让它独立负责报警这件事情。

关于zabbix日志记录逻辑:

C

1
2
3
 
	zabbix_log(LOG_LEVEL_INFORMATION, "%s #%d started [%s #%d]", get_daemon_type_string(daemon_type),
			server_num, get_process_type_string(process_type), process_num);

创建一个DB连接的对象

C

1

2

DBconnect(ZBX_DB_CONNECT_NORMAL);

通过 MySQL 查询 alerts 表中未发送（status 为 ALERT_STATUS_NOT_SENT）的任务，并关联 media_type 表查到对应的发送方式。

C

1
2
3
4
5
6
7
8
9
10
11
12
 
		result = DBselect(
				"select a.alertid,a.mediatypeid,a.sendto,a.subject,a.message,a.status,mt.mediatypeid,"
				"mt.type,mt.description,mt.smtp_server,mt.smtp_helo,mt.smtp_email,mt.exec_path,"
				"mt.gsm_modem,mt.username,mt.passwd,a.retries"
				" from alerts a,media_type mt"
				" where a.mediatypeid=mt.mediatypeid"
					" and a.status=%d"
					" and a.alerttype=%d"
				" order by a.alertid",
				ALERT_STATUS_NOT_SENT,
				ALERT_TYPE_MESSAGE);

它一次性把没有发送、也就是未执行的报警任务都取出来，然后传递给 execute_action 去处理报警的逻辑。

C


1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

 

			ZBX_STR2UINT64(alert.alertid, row[0]);
			ZBX_STR2UINT64(alert.mediatypeid, row[1]);
			alert.sendto = row[2];
			alert.subject = row[3];
			alert.message = row[4];
			alert.status = atoi(row[5]);

			ZBX_STR2UINT64(mediatype.mediatypeid, row[6]);
			mediatype.type = atoi(row[7]);
			mediatype.description = row[8];
			mediatype.smtp_server = row[9];
			mediatype.smtp_helo = row[10];
			mediatype.smtp_email = row[11];
			mediatype.exec_path = row[12];
			mediatype.gsm_modem = row[13];
			mediatype.username = row[14];
			mediatype.passwd = row[15];

			alert.retries = atoi(row[16]);

			*error = '\0';
			res = execute_action(&alert, &mediatype, error, sizeof(error));

虽然有不同的触发动作，但是返回值的状态都一样。下面的逻辑是判断返回状态，把结果写入数据库，并记录到 debug 日志中。发送失败时会累加 retries，只有重试次数达到 ALERT_MAX_RETRIES 后才把状态置为 ALERT_STATUS_FAILED。

C

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
 
			if (SUCCEED == res)
			{
				zabbix_log(LOG_LEVEL_DEBUG, "alert ID [" ZBX_FS_UI64 "] was sent successfully",
						alert.alertid);
				DBexecute("update alerts set status=%d,error='' where alertid=" ZBX_FS_UI64,
						ALERT_STATUS_SENT, alert.alertid);
				alerts_success++;
			}
			else
			{
				zabbix_log(LOG_LEVEL_DEBUG, "error sending alert ID [" ZBX_FS_UI64 "]", alert.alertid);
 
				error_esc = DBdyn_escape_string_len(error, ALERT_ERROR_LEN);
 
				alert.retries++;
 
				if (ALERT_MAX_RETRIES > alert.retries)
				{
					DBexecute("update alerts set retries=%d,error='%s' where alertid=" ZBX_FS_UI64,
							alert.retries, error_esc, alert.alertid);
				}
				else
				{
					DBexecute("update alerts set status=%d,retries=%d,error='%s' where alertid=" ZBX_FS_UI64,
							ALERT_STATUS_FAILED, alert.retries, error_esc, alert.alertid);
				}
 
				zbx_free(error_esc);
 
				alerts_fail++;
			}
 
		}

最后几行代码的意思是：统计本轮耗时，然后 sleep CONFIG_SENDER_FREQUENCY（默认 30）秒，再继续下一轮。

C

1

2

3

4

5

6

7

8

		sec = zbx_time() - sec;

		zbx_setproctitle("%s [sent alerts: %d success, %d fail in " ZBX_FS_DBL " sec, idle %d sec]",
				get_process_type_string(process_type), alerts_success, alerts_fail, sec,
				CONFIG_SENDER_FREQUENCY);

		zbx_sleep_loop(CONFIG_SENDER_FREQUENCY);

通过 server.c 可以确定 zabbix 每轮 alert 发送之间的间隔时间。

Shell



1

2

3

4

 

[xiaorui@devops zabbix-2.4.2 ]$ grep  'CONFIG_SENDER_FREQUENCY'  src/zabbix_server/server.c
int CONFIG_SENDER_FREQUENCY = 30;
[xiaorui@devops zabbix-2.4.2 ]$