Android DNS之惩罚机制

最新推荐文章于 2022-06-12 22:43:46 发布

fanxiaoyu321

最新推荐文章于 2022-06-12 22:43:46 发布

阅读量1.6k

点赞数 1

CC 4.0 BY-SA版权

分类专栏： Android Android libc中dns部分源码分析文章标签： DNS 查询统计

本文链接：https://blog.youkuaiyun.com/xiaoyu_750516366/article/details/82817399

Android 同时被 2 个专栏收录

13 篇文章

订阅专栏

Android libc中dns部分源码分析

10 篇文章

订阅专栏

本文深入解析DNS服务器的统计信息结构，包括样本记录、统计参数及统计信息的定义。阐述了统计信息的初始化、清空、添加样本过程，以及DNS服务器地址的可用性判定和惩罚机制。详细说明了如何通过成功率、样本数量和惩罚时间来决定是否启用惩罚机制，防止低效DNS服务器拖慢网络性能。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

数据结构

统计信息同样是基于网卡的，所以理所当然的，这些信息保存在了resolv_cache_info中，该结构中与统计有关的信息如下：

// Excerpt of the cache record: only the statistics-related members are shown here.
struct resolv_cache_info {
    // Statistics parameters; a single set shared by all servers of this record
    struct __res_params         params;
    // Each DNS server address has its own statistics entry
    struct __res_stats          nsstats[MAXNS];
};

统计参数的定义如下，这几个参数的用法及其含义见下文分析：

/* per-netid configuration parameters passed from netd to the resolver */
struct __res_params {
    uint16_t sample_validity; // sample lifetime in s
    // threshold of success / total samples below which a server is considered broken
    uint8_t success_threshold; // 0: disable, value / 100 otherwise
    uint8_t min_samples; // min # samples needed for statistics to be considered meaningful
    // Upper bound on how many samples are kept in __res_stats.samples. Presumably this must
    // not exceed MAXNSSAMPLES, otherwise the samples array would be indexed out of bounds.
    uint8_t max_samples; // max # samples taken into account for statistics
};

统计信息的定义如下：

/*
 * Resolver reachability statistics and run-time parameters.
 */
// One statistics sample, describing the outcome of a single DNS query
struct __res_sample {
	// Time at which the DNS request was sent, expressed as wall-clock time
    time_t			at;    // time in s at which the sample was recorded
    // Query latency in milliseconds if a response was received; 0 if no response was received
    uint16_t			rtt;   // round-trip time in ms
    // Return code carried in the response message
    uint8_t			rcode; // the DNS rcode or RCODE_XXX defined above
};

#define MAXNSSAMPLES		64	/* max # samples to store per server */

// Per-server sample storage
struct __res_stats {
    // Sample storage, used as a ring buffer
    struct __res_sample		samples[MAXNSSAMPLES];
    // Number of samples currently held in the ring buffer
    uint8_t			sample_count;
    // Index in samples[] at which the next sample will be stored
    uint8_t			sample_next;
};

要强调的是，对于每个网卡，统计参数只有一套，但是统计信息是根据DNS服务器地址分别记录的。

基本操作

初始化

初始化是在设置DNS服务器地址的时候完成的，其中相关代码如下：

int
_resolv_set_nameservers_for_net(unsigned netid, const char** servers, unsigned numservers,
        const char *domains, const struct __res_params* params)
{
    pthread_once(&_res_cache_once, _res_cache_init);
    pthread_mutex_lock(&_res_cache_list_lock);

    //分配resolv_cache_info结构，当然包括统计信息和统计参数
    _get_res_cache_for_net_locked(netid);

    if (cache_info != NULL) {
        uint8_t old_max_samples = cache_info->params.max_samples;
        if (params != NULL) {
        	//如果FWK有设置统计参数，那么使用FWK指定的
            cache_info->params = *params;
        } else {
        	//FWK没有指定，使用默认的
            _resolv_set_default_params(&cache_info->params);
        }

		//设置DNS地址或者修改了统计参数，那么清除统计信息
        if (!_resolv_is_nameservers_equal_locked(cache_info, servers, numservers)) {
            // Clear the NS statistics because the mapping to nameservers might have changed.
            _res_cache_clear_stats_locked(cache_info);
        } else if (cache_info->params.max_samples != old_max_samples) {
            // If the maximum number of samples changes, the overhead of keeping the most recent
            // samples around is not considered worth the effort, so they are cleared instead. All
            // other parameters do not affect shared state: Changing these parameters does not
            // invalidate the samples, as they only affect aggregation and the conditions under
            // which servers are considered usable.
            _res_cache_clear_stats_locked(cache_info);
        }
    }

    pthread_mutex_unlock(&_res_cache_list_lock);
    return 0;
}

清空统计信息_res_cache_clear_stats_locked()

// Resets the sample ring buffer of every nameserver slot in cache_info.
// A NULL cache_info is accepted and ignored.
// NOTE(review): the "_locked" suffix suggests the caller holds _res_cache_list_lock - confirm.
static void _res_cache_clear_stats_locked(struct resolv_cache_info* cache_info) {
    if (cache_info) {
        for (int i = 0 ; i < MAXNS ; ++i) {
            // Index with i: writing through the bare array name would only ever reset
            // slot 0 and leave the statistics of all other servers untouched.
            cache_info->nsstats[i].sample_count = cache_info->nsstats[i].sample_next = 0;
        }
    }
}

添加样本

在res_nsend()中，如果查询结束，会调用_resolv_cache_add_resolver_stats_sample()将样本加入到cache中，代码如下：

// Abridged excerpt of res_nsend(): only the part that records a statistics sample once a
// query has finished is shown; the surrounding code is omitted.
int res_nsend(res_state statp,
	  const u_char *buf, int buflen, u_char *ans, int anssiz)

	// Note: only the result of the first round of queries is recorded
    /* Only record stats the first time we try a query. See above. */
    if (try == 0) {
        struct __res_sample sample;
        // Fill in the sample from now, rcode and delay
        _res_stats_set_sample(&sample, now, rcode, delay);
        // Add the sample to the cache
        _resolv_cache_add_resolver_stats_sample(statp->netid, revision_id,
            ns, &sample, params.max_samples);
    }
}

// Records one sample for nameserver slot `ns` of network `netid`.
// Nothing is recorded when max_samples is not positive, when no cache info exists for the
// network, or when the cache's revision_id differs from the revision_id supplied by the caller.
void _resolv_cache_add_resolver_stats_sample( unsigned netid, int revision_id, int ns,
       const struct __res_sample* sample, int max_samples) {
    if (max_samples > 0) {
        pthread_mutex_lock(&_res_cache_list_lock);

        struct resolv_cache_info* cache_entry = _find_cache_info_locked(netid);
        // Matching revisions guard against recording into a cache whose DNS configuration
        // was modified while the request was in flight.
        const int same_revision =
                cache_entry != NULL && cache_entry->revision_id == revision_id;
        if (same_revision) {
            _res_cache_add_stats_sample_locked(&cache_entry->nsstats[ns], sample,
                    max_samples);
        }

        pthread_mutex_unlock(&_res_cache_list_lock);
    }
}

// Stores *sample in the ring buffer of stats and advances the write position.
//
//   stats       - per-server statistics to update
//   sample      - sample to copy into the buffer
//   max_samples - number of ring-buffer slots to use; values larger than the capacity of
//                 stats->samples are clamped to that capacity
static void
_res_cache_add_stats_sample_locked(struct __res_stats* stats, const struct __res_sample* sample,
        int max_samples) {
    // Never let the ring grow beyond the backing array: an oversized max_samples would
    // otherwise let sample_next index past the end of samples[].
    const int capacity = (int) (sizeof(stats->samples) / sizeof(stats->samples[0]));
    if (max_samples > capacity) {
        max_samples = capacity;
    }

    // Note: This function expects max_samples > 0, otherwise a (harmless) modification of the
    // allocated but supposedly unused memory for samples[0] will happen
    XLOG("%s: adding sample to stats, next = %d, count = %d", __FUNCTION__,
            stats->sample_next, stats->sample_count);
    // Store the new sample at the current write position.
    stats->samples[stats->sample_next] = *sample;
    // The sample count saturates at the configured maximum.
    if (stats->sample_count < max_samples) {
        ++stats->sample_count;
    }
    // samples[] is used as a ring buffer: sample_next is the index of the next slot to be
    // written and wraps back to 0 once it reaches max_samples.
    if (++stats->sample_next >= max_samples) {
        stats->sample_next = 0;
    }
}

惩罚机制

前面介绍的都是统计信息的数据结构以及它们是如何保存的，但还没有说明保存这些信息的目的。这些信息实际上会在res_nsend()中使用，下面先看代码实现，然后再来总结这种机制。

res_nsend()

res_nsend()中有如下代码片段：

// Abridged excerpt of res_nsend(): only the part that consults the statistics to skip
// unusable servers is shown; declarations and the actual send logic are omitted.
int res_nsend(res_state statp, const u_char *buf, int buflen, u_char *ans, int anssiz)
{
	/*
	 * Send request, RETRY times, or until successful.
	 */
	for (try = 0; try < statp->retry; try++) {
	    struct __res_stats stats[MAXNS];
	    struct __res_params params;
        // Fetch the current statistics parameters, statistics and revision_id from the cache
	    int revision_id = _resolv_cache_get_resolver_stats(statp->netid, &params, stats);
        // The call below decides which DNS server addresses are usable; the verdicts are
        // written to usable_servers[]
	    bool usable_servers[MAXNS];
	    android_net_res_stats_get_usable_servers(&params, stats, statp->nscount,
		    usable_servers);
		// While iterating over the DNS server addresses, a server marked unusable is skipped;
        // this skipping is what the article calls the "penalty mechanism"
	    for (ns = 0; ns < statp->nscount; ns++) {
			if (!usable_servers[ns])
            	continue;
            }
        }
    }
}

先来看看当前统计参数和统计信息的获取代码：

// Copies the statistics parameters and the per-server statistics of network `netid` into the
// caller-provided `params` and `stats`.
// Returns the cache's revision_id, or -1 (leaving both outputs untouched) when no cache info
// exists for the network.
int
_resolv_cache_get_resolver_stats( unsigned netid, struct __res_params* params,
        struct __res_stats stats[MAXNS]) {
    pthread_mutex_lock(&_res_cache_list_lock);

    int result = -1;
    struct resolv_cache_info* entry = _find_cache_info_locked(netid);
    if (entry != NULL) {
        // Everything handed back to the caller comes straight from the cache info record.
        result = entry->revision_id;
        *params = entry->params;
        memcpy(stats, entry->nsstats, sizeof(entry->nsstats));
    }

    pthread_mutex_unlock(&_res_cache_list_lock);
    return result;
}

下面重点来看到底是如何判断DNS服务器地址是否可用的。

DNS服务器地址的可用性判定

// Fills usable_servers[0..nscount) with the usability verdict of each server, as decided by
// _res_stats_usable_server(). When not a single server is judged usable, every entry is set
// to true so that at least some server can still be queried.
void
android_net_res_stats_get_usable_servers(const struct __res_params* params,
        struct __res_stats stats[], int nscount, bool usable_servers[]) {
    // Becomes true as soon as one server is judged usable.
    bool any_usable = false;
    for (int idx = 0; idx < nscount; ++idx) {
        const bool verdict = _res_stats_usable_server(params, &stats[idx]);
        usable_servers[idx] = verdict;
        if (verdict) {
            any_usable = true;
        }
    }

    if (any_usable) {
        return;
    }

    // If there are no usable servers, consider all of them usable.
    // TODO: Explore other possibilities, such as enabling only the best N servers, etc.
    // This keeps the penalty from disabling every configured server at once.
    for (int idx = 0; idx < nscount; ++idx) {
        usable_servers[idx] = true;
    }
}

单个DNS服务器地址的可用性判定

// Decides whether a single DNS server may be used, based on its recorded samples.
//
// Returns false only when all of the following hold:
//   - the aggregated success, error and timeout counts are all non-negative,
//   - their sum reaches params->min_samples and at least one error or timeout was recorded,
//   - the success percentage is below params->success_threshold,
//   - the most recent sample is no older than params->sample_validity seconds.
// When the first three hold but the most recent sample is older than that, the samples are
// cleared and the server is reported usable. In every other case the server is usable.
bool _res_stats_usable_server(const struct __res_params* params, struct __res_stats* stats)
{
    int ok_count = -1;
    int error_count = -1;
    int timeout_count = -1;
    int internal_error_count = -1;
    int average_rtt = -1;
    time_t newest_sample = 0;

    // Collect the per-outcome counters, the average RTT and the time of the latest sample.
    android_net_res_stats_aggregate(stats, &ok_count, &error_count, &timeout_count,
            &internal_error_count, &average_rtt, &newest_sample);

    // Counters that are still negative cannot be used for a verdict.
    if (ok_count < 0 || error_count < 0 || timeout_count < 0) {
        return true;
    }

    // Internal errors are deliberately left out of the total.
    const int total = ok_count + error_count + timeout_count;
    if (total < params->min_samples) {
        // Too few samples for the statistics to be meaningful.
        return true;
    }
    if (error_count <= 0 && timeout_count <= 0) {
        // Nothing ever failed, so there is nothing to penalize.
        return true;
    }

    const int success_percent = ok_count * 100 / total;
    if (success_percent >= params->success_threshold) {
        return true;
    }

    // evNowTime() is used here instead of time() to stay consistent with the rest of
    // the code base
    const time_t current = evNowTime().tv_sec;
    if (current - newest_sample <= params->sample_validity) {
        // Poor success rate and the samples are still valid: disable this server.
        return false;
    }

    // Note: It might be worth considering to expire old servers after their expiry
    // date has been reached, however the code for returning the ring buffer to its
    // previous non-circular state would induce additional complexity.
    // The penalty period is over: drop the stale samples and allow the server again.
    _res_stats_clear_samples(stats);
    return true;
}