浅析Linux Kernel 哈希路由表实现(二)

- 三月 09, 2013

Han 通过 Google 阅读器发送给您的内容：

浅析Linux Kernel 哈希路由表实现(二)

于 11-3-15 通过 basic coder 作者：levin

在向外发送数据包的时候，首先需要查询路由表来确定数据包的路由，主要由ip_route_output_key()函数来完成，该函数又调用了ip_route_output_flow()，而这个函数最终又调用了__ip_route_output_key()这个函数来进行路由的查询，下面主要来看一下这个函数：

int __ip_route_output_key(struct net *net, struct rtable **rp,  			  const struct flowi *flp)  {  	unsigned int hash;  	int res;  	struct rtable *rth;     	if (!rt_caching(net))  		goto slow_output;     	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));     	rcu_read_lock_bh();  	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;  		rth = rcu_dereference_bh(rth->dst.rt_next)) {  		if (rth->fl.fl4_dst == flp->fl4_dst &&  		    rth->fl.fl4_src == flp->fl4_src &&  		    rth->fl.iif == 0 &&  		    rth->fl.oif == flp->oif &&  		    rth->fl.mark == flp->mark &&  		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &  			    (IPTOS_RT_MASK | RTO_ONLINK)) &&  		    net_eq(dev_net(rth->dst.dev), net) &&  		    !rt_is_expired(rth)) {  			dst_use(&rth->dst, jiffies);  			RT_CACHE_STAT_INC(out_hit);  			rcu_read_unlock_bh();  			*rp = rth;  			return 0;  		}  		RT_CACHE_STAT_INC(out_hlist_search);  	}  	rcu_read_unlock_bh();     slow_output:  	rcu_read_lock();  	res = ip_route_output_slow(net, rp, flp);  	rcu_read_unlock();  	return res;  }

Linux的路由表中的常用路由是存储在路由缓存中的，该路由缓存即是类型为struct rt_hash_bucket的全局列表rt_hash_table，该缓存列表在ip_rt_init()中初始化。

struct flowi结构中包含了查询路由表所需要的请求信息，是一个搜索键值。由代码可看出，首先在路由缓存列表rt_hash_table中查询精确匹配的未过期的路由表项struct rtable，(注，因为是出口路由，所以入口接口号是0)，若找到后增加路由表项的引用计数后即刻返回。若未找到匹配的路由表项，则继续在路由表中查找匹配的路由表项，路由表中的查询速度会比路由缓存中慢，所以ip_route_output_slow()函数的命名就不难理解了，主要的路由解析工作都是在这个函数里面进行的，在看它的定义之前先看下服务类型和路由范围的相关定义：

/*
 * Type-of-service bits of the IPv4 TOS byte.
 *
 * Each directive must sit on its own line: a #define extends to the end of
 * the physical line, so several of them on one line define only the first.
 */
#define IPTOS_TOS_MASK		0x1E	/* the four TOS bits, 0x02..0x10 */
#define IPTOS_TOS(tos)		((tos) & IPTOS_TOS_MASK)
#define IPTOS_LOWDELAY		0x10	/* minimize delay */
#define IPTOS_THROUGHPUT	0x08	/* maximize throughput */
#define IPTOS_RELIABILITY	0x04	/* maximize reliability */
#define IPTOS_MINCOST		0x02	/* minimize cost */

/* Lowest bit of the TOS value; lies outside IPTOS_TOS_MASK. */
#define RTO_ONLINK		0x01

由掩码可知，服务类型实际上用了从第2位到第5位共四位的数据，表示四种服务类型，而最低位的RTO_ONLINK如果置位，则scope为RT_SCOPE_LINK，若没有置位，则scope为RT_SCOPE_UNIVERSE，接下来看看scope的相关定义：

/* Route scope values. */
enum rt_scope_t {
	/* Anywhere in the address space. */
	RT_SCOPE_UNIVERSE = 0,

	/* User defined values */
	RT_SCOPE_SITE = 200,

	/* Address directly attached to the local link. */
	RT_SCOPE_LINK = 253,

	/* Address local to this host. */
	RT_SCOPE_HOST = 254,

	/* Unreachable address. */
	RT_SCOPE_NOWHERE = 255
};

其中值越大所表示的范围便越精确，实际上这也不是什么范围的意思，只不过是到目的地址的某种距离的表示。OK，接下来看ip_route_output_slow()函数的定义：

static int ip_route_output_slow(struct net *net, struct rtable **rp,  				const struct flowi *oldflp)  {  	u32 tos	= RT_FL_TOS(oldflp);  	struct flowi fl = { .nl_u = { .ip4_u =  				      { .daddr = oldflp->fl4_dst,  					.saddr = oldflp->fl4_src,  					.tos = tos & IPTOS_RT_MASK,  					.scope = ((tos & RTO_ONLINK) ?  						  RT_SCOPE_LINK :  						  RT_SCOPE_UNIVERSE),  				      } },  			    .mark = oldflp->mark,  			    .iif = net->loopback_dev->ifindex,  			    .oif = oldflp->oif };  	struct fib_result res;  	unsigned int flags = 0;  	struct net_device *dev_out = NULL;  	int err;        	res.fi		= NULL;  #ifdef CONFIG_IP_MULTIPLE_TABLES  	res.r		= NULL;

搜索此博客

Oenhan Google Reader