1. 程式人生 > >TCP協議和套接字、IP層之間的介面

TCP協議和套接字、IP層之間的介面

1、TCP和套接字層之間的介面

TCP和套接字層之間的介面資料結構是struct proto,這個結構體的成員主要是一系列函式指標,其中從tcp_close到tcp_shutdown是TCP連線管理處理函式,TCP資料接收則由tcp_recvmsg和tcp_v4_do_rcv函式實現。struct proto tcp_prot定義如下:

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,			//套接字關閉	
	.connect		= tcp_v4_connect,		//練級處理	
	.disconnect		= tcp_disconnect,		//斷開處理
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,		//初始化套接字
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,		//立即關閉套接字
	.setsockopt		= tcp_setsockopt,		//設定選項	
	.getsockopt		= tcp_getsockopt,	//獲取選項
	.recvmsg		= tcp_recvmsg,			//套接字層接受資料包函式
	.backlog_rcv		= tcp_v4_do_rcv,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
};

TCP和套接字層之間的介面資料結構tcp_prot,是在AF_INET協議族的初始化函式inet_init中呼叫proto_register函式註冊的:

/* Excerpt of inet_init(); "..." marks code omitted by the article. */
static int __init inet_init(void)
{
...

	/* Register the TCP protocol instance (tcp_prot). */
	rc = proto_register(&tcp_prot, 1);
...

}

2、TCP和IP層之間的介面

2.1、TCP和IP層之間的接收介面

TCP協議和IP層的資料結構介面是struct net_protocol,struct net_protocol tcp_protocol如下:

/*
 * tcp_protocol - the struct net_protocol instance for TCP, i.e. the set of
 * TCP entry points handed to the IP layer.
 */
static const struct net_protocol tcp_protocol = {
	/* Flags */
	.netns_ok	= 1,
	.no_policy	= 1,

	/* Input from the IP layer */
	.handler	= tcp_v4_rcv,		/* handles packets delivered by IP */
	.err_handler	= tcp_v4_err,		/* handles ICMP errors */

	/* Segmentation offload hooks */
	.gso_send_check	= tcp_v4_gso_send_check,
	.gso_segment	= tcp_tso_segment,

	/* Receive offload hooks */
	.gro_receive	= tcp4_gro_receive,
	.gro_complete	= tcp4_gro_complete,
};

IP層通過ip_local_deliver_finish函式處理後將資料包上傳到傳輸層,做法是根據協議號proto在inet_protos全域性陣列中找到傳輸層的接收函式。TCP和IP層之間的介面資料結構struct net_protocol tcp_protocol,是在inet_init函式中呼叫inet_add_protocol註冊並儲存到全域性陣列inet_protos中的。

/* Excerpt of inet_init(); "..." marks code omitted by the article. */
static int __init inet_init(void)
{
...

	/*
	 * Register the transport-layer handlers for IPPROTO_TCP in the global
	 * inet_protos array; a failure is only logged.
	 */
	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
...

}

2.2、TCP和IP層之間的傳送介面

TCP和IP層之間的傳送介面資料結構是struct inet_connection_sock_af_ops,TCP的struct inet_connection_sock_af_ops例項是ipv4_specific,它包含一組AF_INET地址族中TCP協議例項的操作函式,其目的是讓IPv4和IPv6可以共用同一套TCP與網路層之間的介面。

struct inet_connection_sock_af_ops:

/*
 * Address-family specific operations used by a connection-oriented socket.
 * The IPv4/TCP instance shown in this article is ipv4_specific.
 */
struct inet_connection_sock_af_ops {
	int	    (*queue_xmit)(struct sk_buff *skb);				/* hand a buffer to the network layer for sending */
	void	    (*send_check)(struct sock *sk, struct sk_buff *skb);	/* checksum an outgoing segment */
	int	    (*rebuild_header)(struct sock *sk);				/* NOTE(review): original note says "builds the TCP header"; confirm against inet_sk_rebuild_header */
	int	    (*conn_request)(struct sock *sk, struct sk_buff *skb);
	struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb,
				      struct request_sock *req,
				      struct dst_entry *dst);
	int	    (*remember_stamp)(struct sock *sk);
	u16	    net_header_len;
	u16	    sockaddr_len;
	int	    (*setsockopt)(struct sock *sk, int level, int optname, 
				  char __user *optval, unsigned int optlen);
	int	    (*getsockopt)(struct sock *sk, int level, int optname, 
				  char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPAT
	int	    (*compat_setsockopt)(struct sock *sk,
				int level, int optname,
				char __user *optval, unsigned int optlen);
	int	    (*compat_getsockopt)(struct sock *sk,
				int level, int optname,
				char __user *optval, int __user *optlen);
#endif
	void	    (*addr2sockaddr)(struct sock *sk, struct sockaddr *);
	int	    (*bind_conflict)(const struct sock *sk,
				     const struct inet_bind_bucket *tb);
};

tcp協議struct inet_connection_sock_af_ops資料結構的例項是ipv4_specific

ipv4_specific:

/*
 * ipv4_specific - the IPv4 instance of struct inet_connection_sock_af_ops
 * used by TCP sockets.
 */
const struct inet_connection_sock_af_ops ipv4_specific = {
	/* Sizes: IPv4 header length and sockaddr_in length */
	.net_header_len	   = sizeof(struct iphdr),
	.sockaddr_len	   = sizeof(struct sockaddr_in),

	/* Transmit side: send via IPv4, checksum the segment, rebuild the header */
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,

	/* Connection establishment: incoming request, then child socket creation */
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,

	/* Records the timestamp of the last packet seen from a peer */
	.remember_stamp	   = tcp_v4_remember_stamp,

	/* Network-layer (IPv4) socket options */
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif

	/* Addressing: fill a sockaddr_in for the socket, detect bind conflicts */
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.bind_conflict	   = inet_csk_bind_conflict,
};

ipv4_specific是在tcp_v4_init_sock函式中賦給icsk->icsk_af_ops來完成註冊的:

/* Excerpt of tcp_v4_init_sock(); "..." marks code omitted by the article. */
static int tcp_v4_init_sock(struct sock *sk)
{
...
    /* Install the IPv4 address-family operations on this socket. */
    icsk->icsk_af_ops = &ipv4_specific;
...

}

3、TCP、套接字、IP層之間介面函式

tcp、套接字、和ip層之間的介面函式關係如下圖,通過這張圖就能清晰知道TCP、套接字層、IP層資料包傳送接受流程。

(圖:TCP、套接字、IP層之間介面函式)

4、TCP套接字初始化

當應用層開啟一個套接字後就會呼叫tcp_v4_init_sock函式初始化套接字,主要初始化TCP套接字結構,程式碼如下:

static int tcp_v4_init_sock(struct sock *sk)
{
	//獲取套接字指標
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	//初始化TCP輸出佇列out_of_order_queue
	skb_queue_head_init(&tp->out_of_order_queue);
	//初始化傳送超時時鐘
	tcp_init_xmit_timers(sk);
	//初始化輸入佇列prequeue
	tcp_prequeue_init(tp);

	//初始化重傳時間isc_rto和介質偏差時間mdev,設定為3秒
	icsk->icsk_rto = TCP_TIMEOUT_INIT;
	tp->mdev = TCP_TIMEOUT_INIT;

	/* So many TCP implementations out there (incorrectly) count the
	 * initial SYN frame in their delayed-ACK and congestion control
	 * algorithms that we must have the following bandaid to talk
	 * efficiently to them.  -DaveM
	 */
	tp->snd_cwnd = 2;

	/* See draft-stevens-tcpca-spec-01 for discussion of the
	 * initialization of these values.
	 */
	 //snd_ssthresh設定為32位有效禁止slow start演算法
	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
	//傳送阻塞視窗最大設定16位
	tp->snd_cwnd_clamp = ~0;
	//TCP最小段大小536
	tp->mss_cache = TCP_MSS_DEFAULT;

	//初始化TCP選項結構的重排序域recordering
	tp->reordering = sysctl_tcp_reordering;
	//初始化inet連線套接字阻塞管理操作函式
	icsk->icsk_ca_ops = &tcp_init_congestion_ops;

	//此時套接字的狀態還是close
	sk->sk_state = TCP_CLOSE;

	//指向套接字的回撥函式,當套接字的寫緩衝區有效
	//就呼叫該函式
	sk->sk_write_space = sk_stream_write_space;
	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);

	//註冊ipv4_specific
	icsk->icsk_af_ops = &ipv4_specific;
	icsk->icsk_sync_mss = tcp_sync_mss;
#ifdef CONFIG_TCP_MD5SIG
	tp->af_specific = &tcp_sock_ipv4_specific;
#endif

	/* TCP Cookie Transactions */
	if (sysctl_tcp_cookie_size > 0) {
		/* Default, cookies without s_data_payload. */
		tp->cookie_values =
			kzalloc(sizeof(*tp->cookie_values),
				sk->sk_allocation);
		if (tp->cookie_values != NULL)
			kref_init(&tp->cookie_values->kref);
	}
	/* Presumed zeroed, in order of appearance:
	 *	cookie_in_always, cookie_out_never,
	 *	s_data_constant, s_data_in, s_data_out
	 */
	 //設定傳送緩衝區和接受緩衝區大小,
	 //應用層可以呼叫setsockopt設定
	sk->sk_sndbuf = sysctl_tcp_wmem[1];
	sk->sk_rcvbuf = sysctl_tcp_rmem[1];

	local_bh_disable();
	//tcp_sockets_allocated是一個全域性變數
	//儲存的套接字數量,加1
	percpu_counter_inc(&tcp_sockets_allocated);
	local_bh_enable();

	return 0;
}