1. 程式人生 > >【Linux 核心網路協議棧原始碼剖析】socket.c——BSD Socket層(1)

【Linux 核心網路協議棧原始碼剖析】socket.c——BSD Socket層(1)

寫在前面:本系列文章先把各個層對應的檔案原始碼剖析一遍,最後再穿插起來,理清整個協議棧網路資料包的上下傳送通道,從整體實現上進行把握。

       

圖片來源於《Linux 核心網路棧原始碼情景分析》

更上層函式:tcp socket函式介紹。本篇則是介紹BSD Socket層。其對應函式集定義在socket.c 檔案中,閱讀原始碼後,你會發現這些函式都是層層巢狀呼叫表現出了上下層之間的關係。核心版本:Linux 1.2.13

原始碼剖析:

為方便大家理清思路,先介紹幾個中間函式。建議:像這些大型軟體專案,函式內通常還會呼叫一些公用的基礎類的工具函式,我們在閱讀原始碼時,應該先弄清楚這些函式,這樣當閱讀對應函式時,能很好地把握該函式的內部細節。

/*下面兩個函式實現地址使用者空間和核心空間地址之間的相互移動*/
/*
 * Copy a socket address of ulen bytes from the user buffer uaddr into the
 * kernel buffer kaddr.
 *
 * Returns 0 on success (a zero ulen copies nothing and still succeeds),
 * -EINVAL when ulen is negative or larger than MAX_SOCK_ADDR, or the
 * negative value reported by verify_area() for the user range.
 */
static int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr)
{
	int status;

	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;

	if (ulen != 0) {
		/* The user range must be readable before it is copied from. */
		status = verify_area(VERIFY_READ, uaddr, ulen);
		if (status < 0)
			return status;
		memcpy_fromfs(kaddr, uaddr, ulen);
	}
	return 0;
}
/*
 * Copy a socket address from the kernel buffer kaddr (klen bytes available)
 * to the user buffer uaddr.
 *
 * ulen is a value-result argument: on entry *ulen is the number of bytes
 * the caller is prepared to receive; on successful return it holds the
 * number of bytes actually copied, which is never more than klen.
 *
 * Returns 0 on success, -EINVAL when the resulting length is negative or
 * exceeds MAX_SOCK_ADDR, or the negative value reported by verify_area().
 */
static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen)
{
	int status;
	int count;

	/* *ulen is read here and written back at the end, so it must be writable. */
	status = verify_area(VERIFY_WRITE, ulen, sizeof(*ulen));
	if (status < 0)
		return status;

	count = get_fs_long(ulen);
	/* Never hand out more than the kernel actually has. */
	count = (count > klen) ? klen : count;
	if (count < 0 || count > MAX_SOCK_ADDR)
		return -EINVAL;

	if (count != 0) {
		status = verify_area(VERIFY_WRITE, uaddr, count);
		if (status < 0)
			return status;
		memcpy_tofs(uaddr, kaddr, count);
	}

	/* Report the number of bytes copied back to the caller. */
	put_fs_long(count, ulen);
	return 0;
}
下面這個函式一看就知道什麼意思
/*
 * Fetch the int stored at addr and return it converted to unsigned long.
 */
static inline unsigned long get_user_long(const int *addr)
{
	const int value = *addr;

	return value;
}

/* Read the value at addr through get_user_long(), after casting addr to int *. */
#define get_fs_long(addr) get_user_long((int *)(addr))

為套接字分配檔案描述符,套接字其實同普通的檔案描述符差不多,分配檔案描述符的同時需要一個file結構,file結構中f_inode欄位指向inode(這裡的形參)
/*
 *	Allocate a file descriptor for a network socket.
 *
 *	Obtains a file structure from get_empty_filp(), picks the first unused
 *	slot in the current task's descriptor table, and attaches the file to
 *	socket_file_ops and to the supplied inode (which may be NULL).
 *
 *	Returns the new descriptor, or -1 when no file structure or no free
 *	descriptor slot is available.
 */

static int get_fd(struct inode *inode)
{
	int fd;
	struct file *file;

	/*
	 *	Find a file descriptor suitable for return to the user. 
	 */

	file = get_empty_filp();	/* file structure that will back the descriptor */
	if (!file) 
		return(-1);
	/* Scan for the first empty slot in the descriptor table. */
	for (fd = 0; fd < NR_OPEN; ++fd)
		if (!current->files->fd[fd]) 
			break;
	/* Table is full: give up. */
	if (fd == NR_OPEN) 
	{
		/* Drop our claim on the file structure (presumably marks it free again). */
		file->f_count = 0;
		return(-1);
	}
	/*
	 * Clear the close-on-exec bit for the new descriptor.
	 * (The article's listing had "&current" corrupted into "¤t" here.)
	 */
	FD_CLR(fd, &current->files->close_on_exec);
	current->files->fd[fd] = file;	/* install the file in the table */
	file->f_op = &socket_file_ops;	/* socket implementation of the file operations */
	file->f_mode = 3;		/* presumably read|write -- confirm against the f_mode bits */
	file->f_flags = O_RDWR;
	file->f_count = 1;		/* one reference: the descriptor just installed */
	file->f_inode = inode;
	if (inode) 
		inode->i_count++;	/* the file now holds a reference on the inode */
	file->f_pos = 0;
	return(fd);
}

每個檔案描述符都與對應的inode結構關聯,通過檔案描述符可以找到file結構,通過file結構可以找到inode,而socket結構又是作為inode結構中的一個變數,反過來,inode也是作為socket結構的一個變數,分配套接字時,兩者之間需要建立關聯,見sock_alloc()。
/*
 *	Return the socket structure that is embedded in the given inode
 *	(the u.socket_i member).
 */
inline struct socket *socki_lookup(struct inode *inode)
{
	struct socket *embedded = &inode->u.socket_i;

	return embedded;
}

/*
 *	Translate a file descriptor of the current task into its socket.
 *
 *	Returns NULL when fd is out of range, the slot is empty, the file has
 *	no inode, or the inode is not flagged as a socket (i_sock). When pfile
 *	is non-NULL and the lookup succeeds, *pfile receives the file structure.
 */

static inline struct socket *sockfd_lookup(int fd, struct file **pfile)
{
	struct file *filp;
	struct inode *ino;

	if (fd < 0 || fd >= NR_OPEN)
		return NULL;

	filp = current->files->fd[fd];
	if (filp == NULL)
		return NULL;

	ino = filp->f_inode;
	if (ino == NULL || !ino->i_sock)
		return NULL;

	if (pfile != NULL)
		*pfile = filp;

	return socki_lookup(ino);
}
 下面開始socket結構的處理了

分配一個socket結構

/*
 *	Allocate a socket.
 *
 *	The socket lives inside a freshly obtained inode; the inode is marked
 *	as a socket owned by the current task and the socket fields are reset
 *	to the unconnected defaults. The type field is left for the caller to
 *	fill in.
 *
 *	Returns the new socket, or NULL when no inode could be obtained.
 */

struct socket *sock_alloc(void)
{
	struct inode *ino = get_empty_inode();
	struct socket *s;

	if (ino == NULL)
		return NULL;

	/* Inode side: flag it as a socket owned by the caller. */
	ino->i_mode = S_IFSOCK;
	ino->i_sock = 1;
	ino->i_uid = current->uid;
	ino->i_gid = current->gid;

	/* Socket side: link back to the inode, then clear everything else. */
	s = &ino->u.socket_i;
	s->inode = ino;
	s->wait = &ino->i_wait;
	s->state = SS_UNCONNECTED;
	s->flags = 0;
	s->ops = NULL;
	s->data = NULL;
	s->conn = NULL;
	s->iconn = NULL;
	s->next = NULL;
	s->fasync_list = NULL;

	sockets_in_use++;
	return s;
}

釋放(關閉)套接字
/*
 *	Release a socket.
 */
/*
 * Tell the socket at the other end that the connection is going away:
 * mark it as disconnecting, then wake whatever is waiting on it.
 */
static inline void sock_release_peer(struct socket *peer)
{
	peer->state = SS_DISCONNECTING;	/* presumably set first so woken tasks see the new state */
	wake_up_interruptible(peer->wait);	/* wake tasks sleeping on the peer's wait queue */
	sock_wake_async(peer, 1);	/* asynchronous notification, how == 1 */
}

/*
 *	Release (close) a socket.
 *
 *	Every socket still queued on iconn is notified, the protocol's
 *	release() hook is called, the connected peer (if the socket was in
 *	SS_CONNECTED) is notified, and finally the in-use counter and the
 *	inode reference are dropped.
 */

void sock_release(struct socket *sock)
{
	const int prev_state = sock->state;
	struct socket *pending;
	struct socket *peer = NULL;

	/* Anything other than an unconnected socket is now disconnecting. */
	if (prev_state != SS_UNCONNECTED)
		sock->state = SS_DISCONNECTING;

	/*
	 *	Wake up anyone waiting for connections. The next pointer is
	 *	fetched before the notification is sent.
	 */
	pending = sock->iconn;
	while (pending != NULL) {
		struct socket *following = pending->next;

		sock_release_peer(pending);
		pending = following;
	}

	/*
	 * Wake up anyone we're connected to. First, we release the
	 * protocol, to give it a chance to flush data, etc.
	 */
	if (prev_state == SS_CONNECTED)
		peer = sock->conn;

	if (sock->ops != NULL)
		sock->ops->release(sock, peer);
	if (peer != NULL)
		sock_release_peer(peer);

	sockets_in_use--;
	iput(SOCK_INODE(sock));
}
socket 結構
/*
 * Internal representation of a socket. not all the fields are used by
 * all configurations:
 *
 *		server			client
 * conn		client connected to	server connected to
 * iconn	list of clients		-unused-
 *		 awaiting connections
 * wait		sleep for clients,	sleep for connection,
 *		sleep for i/o		sleep for i/o
 */
 /* One instance of this structure represents one network socket. */
struct socket {
  short			type;		/* socket type, e.g. SOCK_STREAM */
  socket_state		state;		/* connection state (SS_UNCONNECTED, SS_CONNECTED, ...) */
  long			flags;		/* flag bits such as SO_ACCEPTCON, SO_WAITDATA, SO_NOSPACE */
  struct proto_ops	*ops;		/* protocol operation table	*/
    /* data points at protocol-private state whose layout depends on the address family. */
  /* NOTE(review): the original annotation says a sock structure for INET and unix_proto_data for UNIX -- not visible here, confirm. */
  void			*data;	
  /* NOTE(review): the original annotation says conn and iconn are used only by the UNIX domain -- confirm. */
  struct socket		*conn;		/* peer this socket is connected to (see table above)	*/
  struct socket		*iconn;		/* head of the list of clients awaiting connection	*/
  struct socket		*next;		/* next socket on an iconn list */
  struct wait_queue	**wait;		/* wait queue to sleep on	*/
  struct inode		*inode;		/* inode this socket is embedded in */
  struct fasync_struct  *fasync_list;	/* asynchronous wake-up list	*/
};

建立套接字socket,socket
/*
 *	系統呼叫,建立套接字socket。涉及到socket結構的建立.
 */

static int sock_socket(int family, int type, int protocol)
{
	int i, fd;
	struct socket *sock;
	struct proto_ops *ops;

	/* 匹配應用程式呼叫socket()函式時指定的協議 */
	for (i = 0; i < NPROTO; ++i) 
	{
		if (pops[i] == NULL) continue;
		if (pops[i]->family == family) 
			break;
	}
    //沒有匹配的協議,則出錯退出
	if (i == NPROTO) 
	{
  		return -EINVAL;
	}

	ops = pops[i];

/*
 *	Check that this is a type that we know how to manipulate and
 *	the protocol makes sense here. The family can still reject the
 *	protocol later.
 */
  //套接字型別檢查
	if ((type != SOCK_STREAM && type != SOCK_DGRAM &&
		type != SOCK_SEQPACKET && type != SOCK_RAW &&
		type != SOCK_PACKET) || protocol < 0)
			return(-EINVAL);

/*
 *	Allocate the socket and allow the family to set things up. if
 *	the protocol is 0, the family is instructed to select an appropriate
 *	default.
 */
	//分配套接字結構
	if (!(sock = sock_alloc())) 
	{
		printk("NET: sock_socket: no more sockets\n");
		return(-ENOSR);	/* Was: EAGAIN, but we are out of
				   system resources! */
	}
	//指定對應型別,協議,以及操作函式集
	sock->type = type;
	sock->ops = ops;
	//分配下層sock結構,sock結構是比socket結構更底層的表示一個套接字的結構
	//前面博文有說明:http://blog.csdn.net/wenqian1991/article/details/21740945
	//socket是通用的套接字結構體,而sock與具體使用的協議相關
	if ((i = sock->ops->create(sock, protocol)) < 0) 
	{
		sock_release(sock);
		return(i);
	}
    //分配一個檔案描述符並在後面返回給應用層序作為以後的操作控制代碼
	if ((fd = get_fd(SOCK_INODE(sock))) < 0) 
	{
		sock_release(sock);
		return(-EINVAL);
	}

	return(fd);
}
給socket繫結一個埠,bind
/*
 *	Bind a name to a socket. Nothing much to do here since it's
 *	the protocol's responsibility to handle the local address.
 *
 *	The address is copied from user space (umyaddr, addrlen bytes) into a
 *	kernel buffer before the protocol's bind() hook is called.
 *
 *	Returns 0 on success, -EBADF for a bad descriptor, -ENOTSOCK when the
 *	descriptor is not a socket, or the negative value produced by the
 *	address copy or by the protocol.
 */
static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || !current->files->fd[fd])
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = move_addr_to_kernel(umyaddr, addrlen, kaddr);
	if (rc < 0)
		return rc;

	rc = sock->ops->bind(sock, (struct sockaddr *)kaddr, addrlen);
	return (rc < 0) ? rc : 0;
}
監聽客戶端請求,listen
/*
 *	Perform a listen. The protocol's listen() hook, when present, is
 *	called with the requested backlog, and the socket is then flagged
 *	SO_ACCEPTCON so that accept() will work on it.
 *
 *	Returns 0 on success, -EBADF for a bad descriptor, -ENOTSOCK when the
 *	descriptor is not a socket, or -EINVAL when the socket is not in the
 *	unconnected state. The hook's return value is not examined.
 */

static int sock_listen(int fd, int backlog)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || !current->files->fd[fd])
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	/* Only a socket that is not connected yet may start listening. */
	if (sock->state != SS_UNCONNECTED)
		return -EINVAL;

	if (sock->ops != NULL && sock->ops->listen != NULL)
		sock->ops->listen(sock, backlog);

	sock->flags = sock->flags | SO_ACCEPTCON;
	return 0;
}

伺服器接收請求,accept
/*
 *	For accept, we attempt to create a new socket, set up the link
 *	with the client, wake up the client, then return the new
 *	connected fd. We collect the address of the connector in kernel
 *	space and move it to user at the very end. This is buggy because
 *	we open the socket then return an error.
 *
 *	fd must be a socket that is unconnected and flagged SO_ACCEPTCON.
 *	When upeer_sockaddr is non-NULL the peer's address is copied out
 *	through the value-result pair upeer_sockaddr / upeer_addrlen; the
 *	results of getname() and of the copy are not examined.
 *
 *	Returns the new descriptor, or -EBADF, -ENOTSOCK, -EINVAL, -ENOSR,
 *	or the negative value returned by the protocol's dup()/accept().
 */
static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	char peer_addr[MAX_SOCK_ADDR];
	struct socket *sock, *newsock;
	struct file *file;
	int peer_len;
	int newfd;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, &file);
	if (sock == NULL)
		return -ENOTSOCK;

	/* Must still be unconnected, and listen() must have been called. */
	if (sock->state != SS_UNCONNECTED || !(sock->flags & SO_ACCEPTCON))
		return -EINVAL;

	/* The new socket is the one that will carry the connection. */
	newsock = sock_alloc();
	if (newsock == NULL) {
		printk("NET: sock_accept: no more sockets\n");
		/* Was: EAGAIN, but we are out of system resources! */
		return -ENOSR;
	}
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/* Initialise the new socket from the listener, then accept on it. */
	rc = sock->ops->dup(newsock, sock);
	if (rc >= 0)
		rc = newsock->ops->accept(sock, newsock, file->f_flags);
	if (rc < 0) {
		sock_release(newsock);
		return rc;
	}

	newfd = get_fd(SOCK_INODE(newsock));
	if (newfd < 0) {
		sock_release(newsock);
		return -EINVAL;
	}

	if (upeer_sockaddr != NULL) {
		newsock->ops->getname(newsock, (struct sockaddr *)peer_addr, &peer_len, 1);
		move_addr_to_user(peer_addr, peer_len, upeer_sockaddr, upeer_addrlen);
	}
	return newfd;
}

客戶端主動發起連線請求,connect
/*
 *	connect() system call.
 *
 *	The target address is first copied from user space into a kernel
 *	buffer; then, depending on the socket's current state, the request is
 *	either rejected or passed to the protocol's connect() hook together
 *	with the file's flags.
 *
 *	Returns 0 on success, -EBADF, -ENOTSOCK, -EISCONN (already connected
 *	and not a datagram socket), -EINVAL (any other state), or the negative
 *	value produced by the address copy or by the protocol.
 */
static int sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, &file);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = move_addr_to_kernel(uservaddr, addrlen, kaddr);
	if (rc < 0)
		return rc;

	if (sock->state == SS_CONNECTED) {
		/* Hack for now - move this all into the protocol */
		if (sock->type != SOCK_DGRAM)
			return -EISCONN;
	} else if (sock->state != SS_UNCONNECTED &&
		   sock->state != SS_CONNECTING) {
		/*
		 *	SS_UNCONNECTED is fine, and SS_CONNECTING is left for
		 *	the protocol to check.
		 *	FIXME:  for all protocols what happens if you start
		 *	an async connect fork and both children connect. Clean
		 *	this up in the protocols!
		 */
		return -EINVAL;
	}

	rc = sock->ops->connect(sock, (struct sockaddr *)kaddr, addrlen, file->f_flags);
	return (rc < 0) ? rc : 0;
}
上面幾個函式則是我們應用程式設計時 socket、bind、listen、accept、connect 函式對應的核心系統呼叫函式,可以看出,對應的 sock_ 函式內部也是轉呼叫了下一層的函式。
所有網路呼叫函式都具有共同的入口函式 sys_socketcall
/*
 *	System call vectors. Since I (RIB) want to rewrite sockets as streams,
 *	we have this level of indirection. Not a lot of overhead, since more of
 *	the work is done via read/write/select directly.
 *
 *	I'm now expanding this up to a higher level to separate the assorted
 *	kernel/user space manipulations and global assumptions from the protocol
 *	layers proper - AC.
 *
 *	call selects the operation (SYS_SOCKET, SYS_BIND, ...); args points to
 *	an array of that operation's arguments in user space. Each case first
 *	verifies that the number of words it needs is readable, then fetches
 *	them with get_fs_long() and forwards them to the matching sock_*()
 *	routine, whose result is returned unchanged.
 *
 *	Returns the verify_area() result when the argument array is not
 *	readable, and -EINVAL for an unknown call number.
 */
/* Single entry point for the socket calls: it only unpacks and dispatches. */
asmlinkage int sys_socketcall(int call, unsigned long *args)
{
	int er;
	switch(call) 
	{
		case SYS_SOCKET:	/* socket(family, type, protocol) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_socket(get_fs_long(args+0),
				get_fs_long(args+1),	/* value stored at that address */
				get_fs_long(args+2)));
		case SYS_BIND:	/* bind(fd, addr, addrlen) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_bind(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_CONNECT:	/* connect(fd, addr, addrlen) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_connect(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_LISTEN:	/* listen(fd, backlog) */
			er=verify_area(VERIFY_READ, args, 2 * sizeof(long));
			if(er)
				return er;
			return(sock_listen(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_ACCEPT:	/* accept(fd, addr, addrlen pointer) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_accept(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETSOCKNAME:	/* getsockname(fd, addr, addrlen pointer) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getsockname(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETPEERNAME:	/* getpeername(fd, addr, addrlen pointer) */
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getpeername(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_SOCKETPAIR:	/* socketpair: four arguments, the last a pointer */
			er=verify_area(VERIFY_READ, args, 4 * sizeof(long));
			if(er)
				return er;
			return(sock_socketpair(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(unsigned long *)get_fs_long(args+3)));
		case SYS_SEND:	/* send: four arguments, the second a buffer pointer */
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_send(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_SENDTO:	/* sendto: six arguments, the fifth a sockaddr pointer */
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_sendto(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				get_fs_long(args+5)));
		case SYS_RECV:	/* recv: four arguments, the second a buffer pointer */
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recv(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_RECVFROM:	/* recvfrom: six arguments, the last an int pointer */
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recvfrom(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				(int *)get_fs_long(args+5)));
		case SYS_SHUTDOWN:	/* shutdown: two arguments */
			er=verify_area(VERIFY_READ, args, 2* sizeof(unsigned long));
			if(er)
				return er;
			return(sock_shutdown(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_SETSOCKOPT:	/* setsockopt: five arguments, the fourth a char pointer */
			er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
			if(er)
				return er;
			return(sock_setsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				get_fs_long(args+4)));
		case SYS_GETSOCKOPT:	/* getsockopt: five arguments, the last an int pointer */
			er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
			if(er)
				return er;
			return(sock_getsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				(int *)get_fs_long(args+4)));
		default:
			return(-EINVAL);
	}
}

下面再看看socket.c 即BSD socket層中的其餘函式
/*
 *	Sockets are not seekable: every argument is ignored and the
 *	result is always -ESPIPE.
 */

static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int whence)
{
	const int refused = -ESPIPE;

	return refused;
}

/*
 *	Read data from a socket. ubuf is a user mode pointer. We make sure the user
 *	area ubuf...ubuf+size-1 is writable before asking the protocol.
 */

static int sock_read(struct inode *inode, struct file *file, char *ubuf, int size)
{
	struct socket *sock;
	int err;
  
	if (!(sock = socki_lookup(inode))) 
	{
		printk("NET: sock_read: can't find socket for inode!\n");
		return(-EBADF);
	}
	if (sock->flags & SO_ACCEPTCON) 
		return(-EINVAL);

	if(size<0)
		return -EINVAL;
	if(size==0)
		return 0;
	if ((err=verify_area(VERIFY_WRITE,ubuf,size))<0)
	  	return err;
	return(sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK)));
}

/*
 *	Write data to a socket. We verify that the user area ubuf..ubuf+size-1 is
 *	readable by the user process.
 */

static int sock_write(struct inode *inode, struct file *file, char *ubuf, int size)
{
	struct socket *sock;
	int err;
	
	if (!(sock = socki_lookup(inode))) 
	{
		printk("NET: sock_write: can't find socket for inode!\n");
		return(-EBADF);
	}

	if (sock->flags & SO_ACCEPTCON) 
		return(-EINVAL);
	
	if(size<0)
		return -EINVAL;
	if(size==0)
		return 0;
		
	if ((err=verify_area(VERIFY_READ,ubuf,size))<0)
	  	return err;
	return(sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK)));
}

/*
 *	You can't read directories from a socket: every argument is
 *	ignored and the result is always -EBADF.
 */
 
static int sock_readdir(struct inode *inode, struct file *file, struct dirent *dirent,
	     int count)
{
	const int refused = -EBADF;

	return refused;
}

/*
 *	With an ioctl arg may well be a user mode pointer, but we don't know what to do
 *	with it - thats up to the protocol still.
 */

int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
	   unsigned long arg)
{
	struct socket *sock;

	if (!(sock = socki_lookup(inode))) 
	{
		printk("NET: sock_ioctl: can't find socket for inode!\n");
		return(-EBADF);
	}
  	return(sock->ops->ioctl(sock, cmd, arg));
}


/*
 *	select() support for sockets: forward the query to the protocol's
 *	select() hook when there is one.
 *
 *	We can't return errors to select, so it's either yes or no: a missing
 *	socket or a missing hook yields 0.
 */
static int sock_select(struct inode *inode, struct file *file, int sel_type, select_table * wait)
{
	struct socket *sock = socki_lookup(inode);

	if (sock == NULL) {
		printk("NET: sock_select: can't find socket for inode!\n");
		return 0;
	}

	if (sock->ops == NULL || sock->ops->select == NULL)
		return 0;

	return sock->ops->select(sock, sel_type, wait);
}


void sock_close(struct inode *inode, struct file *filp)
{
	struct socket *sock;

	/*
	 *	It's possible the inode is NULL if we're closing an unfinished socket. 
	 */

	if (!inode) 
		return;
//找對inode對應的socket結構
	if (!(sock = socki_lookup(inode))) 
	{
		printk("NET: sock_close: can't find socket for inode!\n");
		return;
	}
	sock_fasync(inode, filp, 0);//更新非同步通知列表
	sock_release(sock);//釋放套接字
}

/*
 *	Update the socket async list.
 *
 *	A non-zero 'on' registers filp on the socket's fasync list (nothing
 *	is added when filp is already present); a zero 'on' removes filp's
 *	entry if there is one. The list is searched and modified between
 *	cli() and restore_flags().
 *
 *	Returns 0, or -ENOMEM when the new entry cannot be allocated.
 */
static int sock_fasync(struct inode *inode, struct file *filp, int on)
{
	struct fasync_struct *fresh = NULL;
	struct fasync_struct *hit;
	struct fasync_struct **link;
	struct socket *sock;
	unsigned long flags;

	/* Allocate up front, before the cli() below. */
	if (on) {
		fresh = (struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
		if (fresh == NULL)
			return -ENOMEM;
	}

	sock = socki_lookup(inode);
	link = &(sock->fasync_list);

	save_flags(flags);
	cli();

	/* Stop at the entry owned by filp; link is the pointer that refers to it. */
	while ((hit = *link) != NULL && hit->fa_file != filp)
		link = &hit->fa_next;

	if (!on) {
		/* Removal: unlink and free the entry, if any. */
		if (hit != NULL) {
			*link = hit->fa_next;
			kfree_s(hit, sizeof(struct fasync_struct));
		}
	} else if (hit != NULL) {
		/* Already registered: the fresh entry is not needed. */
		kfree_s(fresh, sizeof(struct fasync_struct));
	} else {
		/* Not registered yet: push the fresh entry onto the head. */
		fresh->fa_file = filp;
		fresh->magic = FASYNC_MAGIC;
		fresh->fa_next = sock->fasync_list;
		sock->fasync_list = fresh;
	}

	restore_flags(flags);
	return 0;
}

/* 
 * 非同步喚醒函式,通過遍歷socket結構中fasync_list變數指向的佇列,
 * 對佇列中每個元素呼叫kill_fasync函式
 */
int sock_wake_async(struct socket *sock, int how)
{
	if (!sock || !sock->fasync_list)
		return -1;
	switch (how)
	{
		case 0:
			//kill_fasync函式即通過相應的程序傳送訊號。這就是非同步喚醒功能
			kill_fasync(sock->fasync_list, SIGIO);
			break;
		case 1:
			if (!(sock->flags & SO_WAITDATA))
				kill_fasync(sock->fasync_list, SIGIO);
			break;
		case 2:
			if (sock->flags & SO_NOSPACE)
			{
				kill_fasync(sock->fasync_list, SIGIO);
				sock->flags &= ~SO_NOSPACE;
			}
			break;
	}
	return 0;
}

	
/*
 *	Queue the client socket mysock on the listening socket servsock and
 *	wait for the server to complete the connection.
 *	NOTE(review): the original annotation says this is used only by the
 *	UNIX domain (as are iconn/conn) -- not visible here, confirm.
 *
 *	flags: when O_NONBLOCK is set the call does not sleep.
 *
 *	Returns 0 once the state is SS_CONNECTED (or SS_DISCONNECTING after
 *	the sleep), -EINVAL when servsock is not listening, -EINPROGRESS for
 *	a non-blocking request that is not connected yet, -EINTR when the
 *	sleep ended with mysock->conn still set, or -EACCES when mysock->conn
 *	was cleared (rejected).
 */

int sock_awaitconn(struct socket *mysock, struct socket *servsock, int flags)
{
	struct socket *last;

	/*
	 *	We must be listening
	 */
	 /* The server must have SO_ACCEPTCON set, i.e. be accepting connections. */
	if (!(servsock->flags & SO_ACCEPTCON)) 
	{
		return(-EINVAL);
	}

  	/*
  	 *	Put ourselves on the server's incomplete connection queue. 
  	 */
  	 /* Append mysock to the tail of the server's iconn list, */
  	 /* with interrupts disabled while the list is modified. */
	mysock->next = NULL;
	cli();
	if (!(last = servsock->iconn)) 
		servsock->iconn = mysock;
	else 
	{
		while (last->next) 
			last = last->next;
		last->next = mysock;
	}
	mysock->state = SS_CONNECTING;	/* connection in progress */
	mysock->conn = servsock;	/* the server this client is connecting to */
	sti();

	/*
	 * Wake up server, then await connection. server will set state to
	 * SS_CONNECTED if we're connected.
	 */
	 /* Wake the server so that it can process this connection request. */
	wake_up_interruptible(servsock->wait);
	sock_wake_async(servsock, 0);

	/* Not connected yet? */
	if (mysock->state != SS_CONNECTED) 
	{
		if (flags & O_NONBLOCK)
			return -EINPROGRESS;
		/* Sleep until the server handles the request or the sleep is interrupted. */
		interruptible_sleep_on(mysock->wait);
		
		/* Re-check the state after waking up. */
		if (mysock->state != SS_CONNECTED &&
		    mysock->state != SS_DISCONNECTING) 
		{
		/*
		 * if we're not connected we could have been
		 * 1) interrupted, so we need to remove ourselves
		 *    from the server list
		 * 2) rejected (mysock->conn == NULL), and have
		 *    already been removed from the list
		 */
		 /* Interrupted: unlink mysock from the server's iconn list ourselves. */
			if (mysock->conn == servsock) 
			{
				cli();
				/* Locate mysock in the list and splice it out. */
				if ((last = servsock->iconn) == mysock)
					servsock->iconn = mysock->next;
				else 
				{
					while (last->next != mysock) 
						last = last->next;
					last->next = mysock->next;
				}
				sti();
			}
			/* conn still set means interrupted; conn cleared means rejected. */
			return(mysock->conn ? -EINTR : -EACCES);
		}
	}
	return(0);
}
其餘沒有貼出的函式,也基本上是這麼個流程。
socket.c 檔案中函式的實現絕大多數都是簡單呼叫下層函式,而這些下層函式就是af_inet.c 檔案中定義的函式。socket.c 對應 BSD socket層,檔案af_inet.c 則對應的是INET socket層。這些上下層次的表示從函式的巢狀呼叫關係上體現出來。

參考資料:《Linux 核心網路棧原始碼情景分析》、Linux kernel 1.2.13