还是先把无关紧要的函数语句删掉了,只保留了主要部分。
perform_dry_run
/* Perform dry run of all test cases to confirm that the app is working as
expected. This is done only for the initial inputs, and only once. */
static void perform_dry_run(char** argv) {
struct queue_entry* q = queue;
// 校准失败次数
u32 cal_failures = 0;
u8* skip_crashes = getenv("AFL_SKIP_CRASHES");
// 重复执行每个testcase
while (q) {
// use_mem 用于保存testcase的实际内容
u8* use_mem;
u8 res;
s32 fd;
// 获取testdcase对应的文件名称
u8* fn = strrchr(q->fname, '/') + 1;
ACTF("Attempting dry run with '%s'...", fn);
// 打开文件描述符
fd = open(q->fname, O_RDONLY);
if (fd < 0) PFATAL("Unable to open '%s'", q->fname);
// 分配内存,用于存储实际的testcase
use_mem = ck_alloc_nozero(q->len);
// 将testcase读取到use_mem
if (read(fd, use_mem, q->len) != q->len)
FATAL("Short read from '%s'", q->fname);
// 关闭文件描述符
close(fd);
// 这个函数里面会多次执行testcast(3、8次),并调用run_target
res = calibrate_case(argv, q, use_mem, 0, 1);
// 释放内存
ck_free(use_mem);
if (stop_soon) return;
switch (res) {
case FAULT_NONE:
if (q == queue) check_map_coverage();
break;
}
if (q->var_behavior) WARNF("Instrumentation output varies across runs.");
q = q->next;
}
OKF("All test cases processed.");
}
calibrate_case
/* Calibrate a test case: execute it repeatedly, record its trace checksum,
average execution time and bitmap size on the queue entry, and flag inputs
whose trace differs between runs.

argv       - forwarded to init_forkserver() and run_target().
q          - queue entry being calibrated; several of its fields are updated.
use_mem    - test case bytes (q->len of them), passed to write_to_testcase().
handicap   - stored verbatim into q->handicap.
from_queue - not referenced in this block.

Returns the fault value of the last run_target() call, or FAULT_NOBITS when
a first-time, fault-free calibration outside dumb mode produced no new bits. */
static u8 calibrate_case(char** argv, struct queue_entry* q, u8* use_mem,
u32 handicap, u8 from_queue) {
// trace_bits snapshot used as the baseline that later runs are compared with
static u8 first_trace[MAP_SIZE];
u8 fault = 0,
new_bits = 0,
var_detected = 0,
// hnb = "has new bits": latest result of has_new_bits()
hnb = 0,
first_run = (q->exec_cksum == 0);
// start / end timestamps of the calibration loop, in microseconds
u64 start_us, stop_us;
// Save the global stage state; it is restored before returning.
s32 old_sc = stage_cur, old_sm = stage_max;
u32 use_tmout = exec_tmout;
u8* old_sn = stage_name;
// The counter is bumped up front and reset to 0 further down once the
// calibration loop has completed. Presumably a path that jumps to
// abort_calibration (none is present in this block) would leave it
// incremented - TODO confirm.
q->cal_failed++;
// Run the test case 3 times when fast_cal is set, CAL_CYCLES times otherwise.
// Repeating the run matters because the target is not guaranteed to behave
// identically on every execution.
stage_name = "calibration";
stage_max = fast_cal ? 3 : CAL_CYCLES;
/* Make sure the forkserver is up before we do anything, and let's not
count its spin-up time toward binary calibration. */
// Start the fork server only if it is not running yet (forksrv_pid == 0),
// and never in dumb mode or when no_forkserver is set.
// Background reading: https://blog.youkuaiyun.com/weixin_44033321/article/details/136401701
if (dumb_mode != 1 && !no_forkserver && !forksrv_pid)
init_forkserver(argv);
// Entry already has a checksum, i.e. this is not its first calibration:
// take the current trace_bits as the baseline and fold in any new bits.
// NOTE(review): during the initial dry run exec_cksum is expected to be 0;
// later callers (fuzz_one, not shown here) may arrive with a non-zero value
// - confirm.
if (q->exec_cksum) {
memcpy(first_trace, trace_bits, MAP_SIZE);
hnb = has_new_bits(virgin_bits);
if (hnb > new_bits) new_bits = hnb;
}
// Record the start time.
start_us = get_cur_time_us();
for (stage_cur = 0; stage_cur < stage_max; stage_cur++) {
u32 cksum;
// When re-calibrating, refresh the stats every stats_update_freq rounds.
if (!first_run && !(stage_cur % stats_update_freq)) show_stats();
// write_to_testcase() is defined elsewhere; presumably it hands the buffer
// to the target as its input - TODO confirm.
write_to_testcase(use_mem, q->len);
// Execute the target once; the result is one of the FAULT_* codes.
fault = run_target(argv, use_tmout);
/* stop_soon is set by the handler for Ctrl+C. When it's pressed,
we want to bail out quickly. */
// NOTE(review): no stop_soon check actually follows in this block, and
// nothing jumps to the abort_calibration label below.
// Checksum of the trace produced by this run.
cksum = hash32(trace_bits, MAP_SIZE, HASH_CONST);
// A mismatch means either this is the first recorded run (exec_cksum == 0)
// or this run's trace differs from the recorded one.
if (q->exec_cksum != cksum) {
// has_new_bits() compares trace_bits against virgin_bits, returns 1 or 2
// when something new shows up, and updates virgin_bits accordingly.
// Background reading: https://blog.youkuaiyun.com/weixin_44033321/article/details/136422289
hnb = has_new_bits(virgin_bits);
// new_bits starts at 0 and only ever grows (to 1 or 2).
if (hnb > new_bits) new_bits = hnb;
// Not the first recorded run: the trace is unstable.
if (q->exec_cksum) {
u32 i;
for (i = 0; i < MAP_SIZE; i++) {
if (!var_bytes[i] && first_trace[i] != trace_bits[i]) {
// Remember which map bytes vary between runs.
var_bytes[i] = 1;
// Extend the calibration to CAL_CYCLES_LONG rounds.
stage_max = CAL_CYCLES_LONG;
}
}
var_detected = 1;
} else {
// First recorded run: store the checksum and the baseline trace.
q->exec_cksum = cksum;
memcpy(first_trace, trace_bits, MAP_SIZE);
}
}
}
// Record the end time.
stop_us = get_cur_time_us();
// Accumulate global calibration time and round counters.
total_cal_us += stop_us - start_us;
total_cal_cycles += stage_max;
/* OK, let's collect some stats about the performance of this test case.
This is used for fuzzing air time calculations in calculate_score(). */
// Average time per round, bitmap size of the last trace, and bookkeeping.
q->exec_us = (stop_us - start_us) / stage_max;
q->bitmap_size = count_bytes(trace_bits);
q->handicap = handicap;
q->cal_failed = 0;
total_bitmap_size += q->bitmap_size;
total_bitmap_entries++;
// update_bitmap_score() is defined elsewhere; presumably it decides whether
// this test case is a better candidate than the ones seen so far - TODO
// confirm.
update_bitmap_score(q);
/* If this case didn't result in new output from the instrumentation, tell
parent. This is a non-critical problem, but something to warn the user
about. */
if (!dumb_mode && first_run && !fault && !new_bits) fault = FAULT_NOBITS;
abort_calibration:
// new_bits == 2 is has_new_bits()'s "new tuples seen" result; count the
// entry as bringing new coverage only once.
if (new_bits == 2 && !q->has_new_cov) {
q->has_new_cov = 1;
queued_with_cov++;
}
/* Mark variable paths. */
if (var_detected) {
var_byte_count = count_bytes(var_bytes);
if (!q->var_behavior) {
mark_as_variable(q);
queued_variable++;
}
}
// Restore the stage globals saved on entry.
stage_name = old_sn;
stage_cur = old_sc;
stage_max = old_sm;
if (!first_run) show_stats();
return fault;
}
run_target
/* Execute target application, monitoring for timeouts. Return status
information. The called program will update trace_bits[].

argv    - not referenced in this block.
timeout - timeout in milliseconds, used to arm the ITIMER_REAL timer.

Returns FAULT_TMOUT, FAULT_CRASH, FAULT_ERROR or FAULT_NONE; returns 0 when
fork server communication fails while stop_soon is set. */
static u8 run_target(char** argv, u32 timeout) {
static struct itimerval it;
/* Timeout flag of the previous run; sent to the fork server as the request. */
static u32 prev_timed_out = 0;
static u64 exec_ms = 0;
int status = 0;
u32 tb4;
child_timed_out = 0;
/* After this memset, trace_bits[] are effectively volatile, so we
must prevent any earlier operations from venturing into that
territory. */
memset(trace_bits, 0, MAP_SIZE);
MEM_BARRIER();
/* If we're running in "dumb" mode, we can't rely on the fork server
logic compiled into the target program, so we will just keep calling
execve(). There is a bit of code duplication between here and
init_forkserver(), but c'est la vie. */
if (dumb_mode == 1 || no_forkserver) {
/* NOTE(review): this branch is empty in this block, so child_pid is not
set here even though the waitpid(child_pid, ...) call below relies on it.
Presumably the fork()/execve() logic was trimmed - confirm. */
} else {
s32 res;
/* In non-dumb mode, we have the fork server up and running, so simply
tell it to have at it, and then read back PID. */
if ((res = write(fsrv_ctl_fd, &prev_timed_out, 4)) != 4) {
if (stop_soon) return 0;
RPFATAL(res, "Unable to request new process from fork server (OOM?)");
}
if ((res = read(fsrv_st_fd, &child_pid, 4)) != 4) {
if (stop_soon) return 0;
RPFATAL(res, "Unable to request new process from fork server (OOM?)");
}
if (child_pid <= 0) FATAL("Fork server is misbehaving (OOM?)");
}
/* Configure timeout, as requested by user, then wait for child to terminate. */
/* timeout is in milliseconds; split it into seconds and microseconds. */
it.it_value.tv_sec = (timeout / 1000);
it.it_value.tv_usec = (timeout % 1000) * 1000;
setitimer(ITIMER_REAL, &it, NULL);
/* The SIGALRM handler simply kills the child_pid and sets child_timed_out. */
if (dumb_mode == 1 || no_forkserver) {
if (waitpid(child_pid, &status, 0) <= 0) PFATAL("waitpid() failed");
} else {
s32 res;
/* Read the child's 4-byte wait status from the fork server status pipe. */
if ((res = read(fsrv_st_fd, &status, 4)) != 4) {
if (stop_soon) return 0;
RPFATAL(res, "Unable to communicate with fork server (OOM?)");
}
}
/* Forget the PID unless the status says the child is merely stopped. */
if (!WIFSTOPPED(status)) child_pid = 0;
/* Elapsed time = requested timeout minus what is left on the timer. */
getitimer(ITIMER_REAL, &it);
exec_ms = (u64) timeout - (it.it_value.tv_sec * 1000 +
it.it_value.tv_usec / 1000);
/* Disarm the timer. */
it.it_value.tv_sec = 0;
it.it_value.tv_usec = 0;
setitimer(ITIMER_REAL, &it, NULL);
total_execs++;
/* Any subsequent operations on trace_bits must not be moved by the
compiler below this point. Past this location, trace_bits[] behave
very normally and do not have to be treated as volatile. */
MEM_BARRIER();
/* Capture the first four bytes of the raw trace before classify_counts()
rewrites the map; compared against EXEC_FAIL_SIG below. */
tb4 = *(u32*)trace_bits;
#ifdef WORD_SIZE_64
classify_counts((u64*)trace_bits);
#else
classify_counts((u32*)trace_bits);
#endif /* ^WORD_SIZE_64 */
/* Remember the timeout flag for the next fork server request. */
prev_timed_out = child_timed_out;
/* Report outcome to caller. */
if (WIFSIGNALED(status) && !stop_soon) {
kill_signal = WTERMSIG(status);
/* Killed by SIGKILL after our timer fired: a timeout, not a crash. */
if (child_timed_out && kill_signal == SIGKILL) return FAULT_TMOUT;
return FAULT_CRASH;
}
/* A somewhat nasty hack for MSAN, which doesn't support abort_on_error and
must use a special exit code. */
if (uses_asan && WEXITSTATUS(status) == MSAN_ERROR) {
kill_signal = 0;
return FAULT_CRASH;
}
if ((dumb_mode == 1 || no_forkserver) && tb4 == EXEC_FAIL_SIG)
return FAULT_ERROR;
/* It makes sense to account for the slowest units only if the testcase was run
under the user defined timeout. */
if (!(timeout > exec_tmout) && (slowest_exec_ms < exec_ms)) {
slowest_exec_ms = exec_ms;
}
return FAULT_NONE;
}
这里解释下setitimer之后代码的处理逻辑;
- setitimer首先设置一个计时器,代码继续往下执行
- 从forkserver读取child进程的执行状态(read(fsrv_st_fd, &status, 4))
- 这个时候有2种情况,
- 第一种是child代码顺利执行完毕,给read函数返回一个status
- 第二种是timer到时间了,read还没有读到任何信息,还在阻塞,那么timer就会发送一个SIGALRM信号,前面通过setup_signal_handlers注册的信号处理函数会被调用(具体来说,调用kill把child进程杀死,并设置child_timed_out)。child被杀死之后,read结束阻塞,读到的status表明child是被信号终止的。
- 调用WIFSIGNALED(status)查看child是不是被信号终止的,如果是,调用WTERMSIG(status)取得终止它的信号编号(保存到kill_signal);如果child_timed_out被设置且信号是SIGKILL,返回FAULT_TMOUT,否则返回FAULT_CRASH。
classify_counts函数
/* Bucket the hit counts of a whole trace map in place. mem must point to
   MAP_SIZE bytes; the map is walked as 64-bit words, and every non-zero
   word is rewritten 16 bits at a time through count_class_lookup16. */

static inline void classify_counts(u64* mem) {

  u64* const map_end = mem + (MAP_SIZE >> 3);

  for (; mem != map_end; mem++) {

    u16* half;
    u32  k;

    /* Optimize for sparse bitmaps: all-zero words need no work. */

    if (!unlikely(*mem)) continue;

    half = (u16*)mem;

    for (k = 0; k < 4; k++)
      half[k] = count_class_lookup16[half[k]];

  }

}
count_class_lookup16
/* Build the 16-bit classification table from the 8-bit one. Every 16-bit
   index is split into its high and low byte; each byte is mapped through
   count_class_lookup8 on its own and the two results are glued back
   together. Looking up one 16-bit word therefore gives the same answer as
   classifying its two bytes one at a time. */

EXP_ST void init_count_class16(void) {

  u32 word;

  for (word = 0; word < 65536; word++) {

    u32 hi = word >> 8;
    u32 lo = word & 0xff;

    count_class_lookup16[word] =
      (count_class_lookup8[hi] << 8) | count_class_lookup8[lo];

  }

}
count_class_lookup8
/* Hit-count bucketing table: maps a raw 8-bit counter value to a value with
   at most one bit set, so that every count inside the same range ends up in
   the same bucket. */

static const u8 count_class_lookup8[256] = {

  [0]           = 0x00,  /* 0 hits         -> no bit set */
  [1]           = 0x01,  /* 1 hit          -> bit 0      */
  [2]           = 0x02,  /* 2 hits         -> bit 1      */
  [3]           = 0x04,  /* 3 hits         -> bit 2      */
  [4 ... 7]     = 0x08,  /* 4 to 7 hits    -> bit 3      */
  [8 ... 15]    = 0x10,  /* 8 to 15 hits   -> bit 4      */
  [16 ... 31]   = 0x20,  /* 16 to 31 hits  -> bit 5      */
  [32 ... 127]  = 0x40,  /* 32 to 127 hits -> bit 6      */
  [128 ... 255] = 0x80   /* 128+ hits      -> bit 7      */

};
has_new_bits函数
/* Compare the current trace (trace_bits) with a virgin map and report what,
   if anything, is new:

     0 - nothing new,
     1 - only the hit count of an already-seen tuple changed,
     2 - at least one tuple was seen for the first time.

   Every bit present in the trace is cleared from the virgin map, so calling
   this again on the same trace returns 0. When something new was found and
   virgin_map is the global virgin_bits, bitmap_changed is set.

   This runs after every execution over the whole map, so it walks the data
   as 64-bit words and only looks at individual bytes when it has to. */

static inline u8 has_new_bits(u8* virgin_map) {

  u64* cur_word = (u64*)trace_bits;
  u64* vir_word = (u64*)virgin_map;
  u64* const cur_end = cur_word + (MAP_SIZE >> 3);

  u8 verdict = 0;

  for (; cur_word != cur_end; cur_word++, vir_word++) {

    /* Fast path: the trace word is empty, or none of its bits are still set
       in the virgin map. This is by far the most common case. */

    if (!(unlikely(*cur_word) && unlikely(*cur_word & *vir_word))) continue;

    /* Once a brand-new tuple has been seen the verdict cannot improve, so
       the per-byte inspection is skipped from then on. */

    if (likely(verdict < 2)) {

      u8* cur_byte = (u8*)cur_word;
      u8* vir_byte = (u8*)vir_word;
      u32 k;

      verdict = 1;

      /* A non-zero trace byte whose virgin byte is still 0xff means the
         tuple has never been hit before. */

      for (k = 0; k < 8; k++) {

        if (cur_byte[k] && vir_byte[k] == 0xff) {
          verdict = 2;
          break;
        }

      }

    }

    *vir_word &= ~*cur_word;

  }

  if (verdict && virgin_map == virgin_bits) bitmap_changed = 1;

  return verdict;

}
前面的classify代码把trace_bits进行了简化:落在同一个区间内的执行次数会被映射成同一个只含一个bit的值。举个例子,按照count_class_lookup8,0001xxxx(16~31次)都会被简化为00100000(32)。
个人感觉这种处理能够简化计算:cur[0] && vir[0] == 0xff,由于&&有短路效果,cur[0]为0,vir[0] == 0xff不再执行。
而且能够明确地反映执行次数的改变,举个例子,如果第一次执行了11111111次,第二次执行了00000001次,其实这2者的执行次数存在巨大变化,但是如果不进行归一化,11111111会把virgin的所有bits清0,之后的00000001就会被认为没有发生变化。
11111111和10000000,虽然差了上百,但是实际执行次数只差了1倍;而00000011和00000010虽然只差了1,执行次数的相对差距同样不小(1.5倍),感觉这里的处理还是挺巧妙的。