1. 程式人生 > >netfilter之nat程式碼分析

netfilter之nat程式碼分析

nat主要在PRE_ROUTING、LOCAL_OUT、LOCAL_IN、POST_ROUTING四個鏈上註冊了hook函式,PRE_ROUTING、LOCAL_OUT這兩個鏈上做DNAT,LOCAL_IN和POST_ROUTING鏈上做SNAT。nat表沒有LOCAL_IN鏈,但在LOCAL_IN上註冊了鉤子函式nf_nat_fn,主要作用是修改資料包的源埠。

/*
 * IPv4 NAT hook registrations, one entry per netfilter hook point.
 * Destination rewriting uses NF_IP_PRI_NAT_DST, source rewriting uses
 * NF_IP_PRI_NAT_SRC.
 */
static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
	/* PRE_ROUTING: destination NAT, ahead of packet filtering. */
	{
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_NAT_DST,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_in,
		.owner		= THIS_MODULE,
	},
	/* POST_ROUTING: source NAT, after packet filtering. */
	{
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_NAT_SRC,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_out,
		.owner		= THIS_MODULE,
	},
	/* LOCAL_OUT: destination NAT, ahead of packet filtering. */
	{
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_local_fn,
		.owner		= THIS_MODULE,
	},
	/* LOCAL_IN: source NAT, after packet filtering; calls nf_nat_fn directly. */
	{
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_fn,
		.owner		= THIS_MODULE,
	},
};

1、nf_nat_in

nf_nat_in鉤子函式註冊在PRE_ROUTING鏈上,最終做DNAT的處理函式是nf_nat_fn,這個函式後面再講。做了DNAT後如果目的地址改變而且資料包沒有被扔掉,就呼叫skb_dst_drop,這個函式最終呼叫dst_release,將skb關聯的dst_entry的引用計數減1,並將skb->dst設定為NULL。

/*
 * PRE_ROUTING hook: run the common NAT path, then drop the skb's dst
 * reference if the destination address was rewritten.
 *
 * Returns the verdict produced by nf_nat_fn() unchanged.
 */
static unsigned int
nf_nat_in(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	/* Remember the destination as it was before NAT ran. */
	const __be32 old_daddr = ip_hdr(skb)->daddr;
	unsigned int verdict = nf_nat_fn(hooknum, skb, in, out, okfn);

	/* On drop/steal verdicts, return without looking at the skb again. */
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	/* Destination changed: release the dst attached to the skb. */
	if (ip_hdr(skb)->daddr != old_daddr)
		skb_dst_drop(skb);

	return verdict;
}

2、nf_nat_out

nf_nat_out註冊在POST_ROUTING鏈上,實現的功能是做SNAT,最終處理的函式也是nf_nat_fn。

/*
 * POST_ROUTING hook: source NAT through the common nf_nat_fn() path.
 *
 * With CONFIG_XFRM, if the connection's source address or source
 * protocol part differs from the mirrored value in the opposite
 * direction's tuple, ip_xfrm_me_harder() is called and a failure there
 * turns the verdict into NF_DROP.
 */
static unsigned int
nf_nat_out(unsigned int hooknum,
	   struct sk_buff *skb,
	   const struct net_device *in,
	   const struct net_device *out,
	   int (*okfn)(struct sk_buff *))
{
	unsigned int verdict;
#ifdef CONFIG_XFRM
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
#endif

	/* Too short to hold an IPv4 header (e.g. root using raw sockets). */
	if (skb->len < sizeof(struct iphdr) ||
	    ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	verdict = nf_nat_fn(hooknum, skb, in, out, okfn);
#ifdef CONFIG_XFRM
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return verdict;

	dir = CTINFO2DIR(ctinfo);
	/* Source address or source protocol part was translated. */
	if (ct->tuplehash[dir].tuple.src.u3.ip !=
	    ct->tuplehash[!dir].tuple.dst.u3.ip ||
	    ct->tuplehash[dir].tuple.src.u.all !=
	    ct->tuplehash[!dir].tuple.dst.u.all) {
		if (ip_xfrm_me_harder(skb) != 0)
			return NF_DROP;
	}
#endif
	return verdict;
}

3、nf_nat_local_fn

nf_nat_local_fn註冊在LOCAL_OUT鏈上,最終也是呼叫nf_nat_fn做DNAT。在LOCAL_OUT鏈之前資料包已經做了路由選擇,因為做DNAT後目的地址改變,所以要呼叫ip_route_me_harder重新選擇路由。

/*
 * LOCAL_OUT hook: destination NAT through the common nf_nat_fn() path.
 *
 * If the destination address was translated, the packet is re-routed
 * with ip_route_me_harder(). Otherwise, with CONFIG_XFRM, a translated
 * destination protocol part triggers ip_xfrm_me_harder(). A failure of
 * either helper yields NF_DROP.
 */
static unsigned int
nf_nat_local_fn(unsigned int hooknum,
		struct sk_buff *skb,
		const struct net_device *in,
		const struct net_device *out,
		int (*okfn)(struct sk_buff *))
{
	const struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	enum ip_conntrack_dir dir;
	unsigned int verdict;

	/* Too short to hold an IPv4 header (e.g. root using raw sockets). */
	if (skb->len < sizeof(struct iphdr) ||
	    ip_hdrlen(skb) < sizeof(struct iphdr))
		return NF_ACCEPT;

	verdict = nf_nat_fn(hooknum, skb, in, out, okfn);
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return verdict;

	dir = CTINFO2DIR(ctinfo);

	/* Destination address differs from the mirrored tuple: pick a new route. */
	if (ct->tuplehash[dir].tuple.dst.u3.ip !=
	    ct->tuplehash[!dir].tuple.src.u3.ip)
		return ip_route_me_harder(skb, RTN_UNSPEC) ? NF_DROP : verdict;

#ifdef CONFIG_XFRM
	/* Address unchanged, but the destination protocol part was translated. */
	if (ct->tuplehash[dir].tuple.dst.u.all !=
	    ct->tuplehash[!dir].tuple.src.u.all &&
	    ip_xfrm_me_harder(skb))
		return NF_DROP;
#endif
	return verdict;
}

4、nf_nat_fn

nf_nat_fn對資料包的連線跟蹤選項的NAT只做一次,後續的資料包根據連結跟蹤做NAT。nf_nat_fn主要做以下幾件事

(1)判斷資料包的連結跟蹤是否建立,如果沒有建立直接返回,如果連結跟蹤沒有關聯nf_conn_nat也返回

(2)如果資料包狀態是一個期望連結或者有reply方向,而且協議是icmp就呼叫nf_nat_icmp_reply_translation對icmp做nat

(3)如果資料包的狀態是IP_CT_NEW,就呼叫nf_nat_initialized判斷該資料包的連結跟蹤是否已經做 NAT,如果還沒有做NAT而且是LOCAL_IN鏈上的鉤子函式,就呼叫alloc_null_binding修改連結跟蹤reply方向

(4)呼叫函式nf_nat_rule_find查詢nat表最後由nf_nat_packet根據連結跟蹤做nat

/*
 * Common NAT hook body shared by all four NAT hook points.
 *
 * Looks up the conntrack entry attached to the skb. For connections whose
 * NAT mapping for this hook's manip type has not been set up yet, it
 * establishes one (via the nat table, or a null binding on LOCAL_IN) and
 * then rewrites the packet with nf_nat_packet().
 *
 * Returns NF_ACCEPT without touching the packet when there is no conntrack
 * entry, the packet is untracked, or the NAT extension is missing and
 * cannot be added. Otherwise returns the verdict of the ICMP translation,
 * the rule lookup / binding setup (if not NF_ACCEPT), or nf_nat_packet().
 */
static unsigned int
nf_nat_fn(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);

	/* We never see fragments: conntrack defrags on pre-routing
	   and local-out, and nf_nat_out protects post-routing. */
	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));

	/* Fetch the conntrack entry and the packet's state (ctinfo). */
	ct = nf_ct_get(skb, &ctinfo);
	/* Can't track?  It's not due to stress, or conntrack would
	   have dropped it.  Hence it's the user's responsibility to
	   packet filter it out, or implement conntrack/NAT for that
	   protocol. 8) --RR */
	if (!ct)
		return NF_ACCEPT;

	/* Don't try to NAT if this packet is not conntracked */
	if (ct == &nf_conntrack_untracked)
		return NF_ACCEPT;

	/* Get the NAT extension of the conntrack entry, adding it if absent. */
	nat = nfct_nat(ct);
	if (!nat) {
		/* NAT module was loaded late. */
		/* An already confirmed conntrack entry is accepted as is. */
		if (nf_ct_is_confirmed(ct))
			return NF_ACCEPT;
		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
		if (nat == NULL) {
			pr_debug("failed to add NAT extension\n");
			return NF_ACCEPT;
		}
	}

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED+IP_CT_IS_REPLY:
		if (ip_hdr(skb)->protocol == IPPROTO_ICMP) {
			/* RELATED ICMP packets (either direction) are handled
			   entirely by nf_nat_icmp_reply_translation(). */
			if (!nf_nat_icmp_reply_translation(ct, ctinfo,
							   hooknum, skb))
				return NF_DROP;
			else
				return NF_ACCEPT;
		}
		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
	case IP_CT_NEW:

		/* Seen it before?  This can happen for loopback, retrans,
		   or local packets.. */
		/* Check whether NAT of this manip type is already set up. */
		if (!nf_nat_initialized(ct, maniptype)) {
			unsigned int ret;

			/* On LOCAL_IN there are no rules to consult, so a
			   null binding is set up for the connection. */
			if (hooknum == NF_INET_LOCAL_IN)
				/* LOCAL_IN hook doesn't have a chain!  */
				ret = alloc_null_binding(ct, hooknum);
			else
				/* Consult the nat table rules for this hook. */
				ret = nf_nat_rule_find(skb, hooknum, in, out,
						       ct);

			if (ret != NF_ACCEPT)
				return ret;
		} else
			pr_debug("Already setup manip %s for ct %p\n",
				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct);
		break;

	default:
		/* ESTABLISHED */
		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
			     ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
	}

	/* Rewrite the packet according to the conntrack tuples. */
	return nf_nat_packet(ct, ctinfo, hooknum, skb);
}

4.1 nf_nat_initialized

nf_nat_initialized判斷連結跟蹤選項是否做了NAT,如做了NAT那麼ct->status就會設定IPS_SRC_NAT_DONE_BIT或IPS_DST_NAT_DONE_BIT。

/*
 * Report whether NAT setup for the given manip type has been completed
 * on this connection, by testing the matching *_NAT_DONE bit in
 * ct->status. Returns the result of test_bit().
 */
static inline int nf_nat_initialized(struct nf_conn *ct,
				     enum nf_nat_manip_type manip)
{
	/* SRC manip maps to the SRC done bit; anything else to the DST one. */
	const int done_bit = (manip == IP_NAT_MANIP_SRC)
			     ? IPS_SRC_NAT_DONE_BIT
			     : IPS_DST_NAT_DONE_BIT;

	return test_bit(done_bit, &ct->status);
}

4.2 alloc_null_binding

當資料包的狀態是IP_CT_NEW並且是LOCAL_IN鏈上的就呼叫alloc_null_binding對連結跟蹤做NAT修改reply方向,因為LOCAL_IN是netfilter框架的一個出口,如果這時連結跟蹤沒做NAT那麼資料包出去就會有問題。

unsigned int
alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
	/* Force range to this IP; let proto decide mapping for
	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
	   Use reply in case it's already been mangled (eg local packet).
	*/
	__be32 ip
		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
		   ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
		   : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
	struct nf_nat_range range
		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };

	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
	/*連結跟蹤做nat,修改tuple的reply方向*/
	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}

4.3 nf_nat_rule_find

連結狀態是IP_CT_NEW、IP_CT_RELATED、IP_CT_RELATED+IP_CT_IS_REPLY,而且不是在LOCAL_IN上就呼叫nf_nat_rule_find查詢NAT表匹配規則,找到就呼叫相應的target函式(ipt_snat_target或者ipt_dnat_target)實現連線跟蹤項的轉換。如果沒有找到就呼叫alloc_null_binding做連結跟蹤的NAT。alloc_null_binding實際呼叫的是nf_nat_setup_info,這個函式下一節再分析。

/*
 * Run the packet through the nat table of the connection's network
 * namespace. If the table accepts the packet but no NAT mapping of this
 * hook's manip type has been set up, install a null binding.
 *
 * Returns the table verdict, or the result of alloc_null_binding() when
 * a null binding had to be installed.
 */
int nf_nat_rule_find(struct sk_buff *skb,
		     unsigned int hooknum,
		     const struct net_device *in,
		     const struct net_device *out,
		     struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	int verdict = ipt_do_table(skb, hooknum, in, out,
				   net->ipv4.nat_table);

	/* Any non-accept verdict from the table is passed straight back. */
	if (verdict != NF_ACCEPT)
		return verdict;

	/* NAT of this manip type already set up: nothing more to do. */
	if (nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
		return verdict;

	/* Not set up yet: fall back to a null mapping. */
	return alloc_null_binding(ct, hooknum);
}

5、nf_nat_packet

當資料包的連結跟蹤已經做了NAT,就呼叫nf_nat_packet根據連結跟蹤修改資料包的ip、埠做NAT。這個函式很巧妙,此時連結跟蹤已經做了NAT,就取dir的反方向的tuple,然後再對tuple中的源ip、目的ip、源埠、目的埠顛倒,得到target,最後呼叫manip_pkt做NAT修改資料包的ip地址和埠。

這個地方有點繞,舉一個例子:比如一個閘道器112.112.112.112,它下面的區域網有一個A裝置ip是192.168.0.100,這個A裝置要訪問一個外網伺服器地址是113.113.113.113,這樣必須由閘道器做SNAT,首先連結跟蹤做了SNAT後tuple如下

orig
src dst
192.168.0.100 113.113.113.113
reply
src dst
113.113.113.113 112.112.112.112

裝置A的資料包訪問伺服器是orig方向:192.168.0.100 -> 113.113.113.113,呼叫nf_nat_packet取反也就是reply:113.113.113.113 -> 112.112.112.112,再顛倒過來得到target:112.112.112.112 -> 113.113.113.113,然後按target:112.112.112.112 -> 113.113.113.113修改資料包的源ip、目的ip完成SNAT轉換。

當外部伺服器有資料包reply:113.113.113.113 -> 112.112.112.112,呼叫nf_nat_packet取反方向也就是orig:192.168.0.100 -> 113.113.113.113,再顛倒過來得到target:113.113.113.113 -> 192.168.0.100,然後按target:113.113.113.113 -> 192.168.0.100修改資料包的源ip、目的ip。

比如110.110.110.110的閘道器地址要做DNAT到內部一個地址192.168.0.200,一個外網地址111.111.111.111訪問閘道器110.110.110.110就會做DNAT到192.168.0.200,連結跟蹤做DNAT後tuple如下

orig
src dst
111.111.111.111 110.110.110.110

 

reply
src dst
192.168.0.200 111.111.111.111

 

當外網地址訪問閘道器也就是orig方向:111.111.111.111->110.110.110.110,呼叫nf_nat_packet會取相反方向的tuple也就是reply:192.168.0.200->111.111.111.111,然後顛倒得到target:111.111.111.111->192.168.0.200,然後修改源ip、目的ip完成DNAT轉換。

192.168.0.200有回覆包也就是reply方向:192.168.0.200->111.111.111.111,呼叫nf_nat_packet會取相反方向的tuple也就是orig:111.111.111.111->110.110.110.110,然後顛倒得到target:110.110.110.110->111.111.111.111,修改資料包的源地址、目的地地址完成reply。

所以說NAT其實就是基於連結跟蹤實現的。

/*
 * Apply to the packet the translation previously recorded on the
 * connection by nf_nat_setup_info().
 *
 * Returns NF_ACCEPT when no manipulation is needed for this hook and
 * direction or when the rewrite succeeds; NF_DROP when manip_pkt() fails.
 */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	const enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	const enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	unsigned long statusbit = (mtype == IP_NAT_MANIP_SRC) ? IPS_SRC_NAT
							      : IPS_DST_NAT;
	struct nf_conntrack_tuple target;

	/* For the reply direction the SRC/DST status bits swap roles. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic read: these bits don't change. */
	if (!(ct->status & statusbit))
		return NF_ACCEPT;

	/*
	 * The packet must end up looking like the inverse of the tuple
	 * stored for the opposite direction.
	 */
	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

	if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
		return NF_DROP;

	return NF_ACCEPT;
}

6、manip_pkt

manip_pkt主要根據傳進來的target和maniptype完成三層、四層的NAT轉換。先獲取四層的struct nf_nat_protocol結構體例項,然後呼叫四層協議的manip_pkt完成四層埠的NAT,最後修改IP頭中的源地址或目的地址並更新IP頭校驗和。

/*
 * Rewrite one packet to match @target for the given manip type: first
 * the layer-4 part through the protocol's manip_pkt handler, then the
 * IPv4 source or destination address together with the header checksum.
 *
 * @iphdroff is the offset of the IP header from skb->data.
 * Returns false if the skb cannot be made writable or the layer-4
 * handler fails, true otherwise.
 */
static bool
manip_pkt(u_int16_t proto,
	  struct sk_buff *skb,
	  unsigned int iphdroff,
	  const struct nf_conntrack_tuple *target,
	  enum nf_nat_manip_type maniptype)
{
	const struct nf_nat_protocol *l4proto;
	struct iphdr *iph;
	__be32 *addr;
	__be32 new_addr;

	if (!skb_make_writable(skb, iphdroff + sizeof(*iph)))
		return false;

	/* Layer-4 part first. rcu_read_lock()ed by nf_hook_slow. */
	l4proto = __nf_nat_proto_find(proto);
	if (!l4proto->manip_pkt(skb, iphdroff, target, maniptype))
		return false;

	/* Locate the IP header once the layer-4 handler has run. */
	iph = (void *)skb->data + iphdroff;

	/* SNAT rewrites the source address, DNAT the destination address. */
	if (maniptype == IP_NAT_MANIP_SRC) {
		addr = &iph->saddr;
		new_addr = target->src.u3.ip;
	} else {
		addr = &iph->daddr;
		new_addr = target->dst.u3.ip;
	}

	/* Update the header checksum before overwriting the old address. */
	csum_replace4(&iph->check, *addr, new_addr);
	*addr = new_addr;

	return true;
}