为了体现“一切皆文件”的理念,套接口在创建时即与一个文件及文件描述符绑定,此后所有对该套接口的操作都通过文件描述符进行,既包括专门的套接口系统调用,也包括标准的I/O系统调用。

套接口层整体流程图

socket—proto_ops—inetsw_array等基本结构-编程知识网

1. socket结构

/***  struct socket - general BSD socket*  @state: socket state (%SS_CONNECTED, etc)*  @type: socket type (%SOCK_STREAM, etc)*  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)*  @ops: protocol specific socket operations*  @file: File back pointer for gc*  @sk: internal networking protocol agnostic socket representation*  @wq: wait queue for several uses*/
struct socket {socket_state		state;//socket接口状态,例如SS_CONNECTEDkmemcheck_bitfield_begin(type);short			type;//socket类型,例如SOCK_STREAMkmemcheck_bitfield_end(type);unsigned long		flags;//一组标志,SOCK_ASYNC_NOSPACE(发送队列是否已满)等struct socket_wq __rcu	*wq;//等待该套接字的进程队列struct file		*file;//指向与该socket相关联的file结构的指针struct sock		*sk;//socket网络层表示const struct proto_ops	*ops;//用来将套接口系统调用映射到传输层相应的协议实现的结构
};

socket_state state;     

套接口状态。其中有些状态只对TCP套接字有意义,因为只有TCP是面向连接的,存在状态转换的过程;而UDP和RAW则不需要维护套接口的状态。

//include/linux/net.h
/* Connection state of a struct socket (stored in its state member). */
typedef enum {
	SS_FREE          = 0,	/* not allocated */
	SS_UNCONNECTED   = 1,	/* unconnected to any socket */
	SS_CONNECTING    = 2,	/* in process of connecting */
	SS_CONNECTED     = 3,	/* connected to socket */
	SS_DISCONNECTING = 4	/* in process of disconnecting */
} socket_state;

short type;

套接字类型:

/**
 * enum sock_type - Socket types
 * @SOCK_STREAM: stream (connection) socket
 * @SOCK_DGRAM: datagram (conn.less) socket
 * @SOCK_RAW: raw socket
 * @SOCK_RDM: reliably-delivered message
 * @SOCK_SEQPACKET: sequential packet socket
 * @SOCK_DCCP: Datagram Congestion Control Protocol socket
 * @SOCK_PACKET: linux specific way of getting packets at the dev level.
 *		  For writing rarp and other similar things on the user level.
 *
 * When adding some new socket type please
 * grep ARCH_HAS_SOCKET_TYPE include/asm-* /socket.h, at least MIPS
 * overrides this enum for binary compat reasons.
 */
enum sock_type {
	SOCK_STREAM	= 1,
	SOCK_DGRAM	= 2,
	SOCK_RAW	= 3,
	SOCK_RDM	= 4,
	SOCK_SEQPACKET	= 5,
	SOCK_DCCP	= 6,
	SOCK_PACKET	= 10,
};

unsigned long flags; 

一组标志位

/*
 * Flag values kept in the flags member of struct socket.
 * NOTE(review): the small consecutive values suggest these are bit
 * positions rather than masks -- confirm against the code that uses them.
 */
#define SOCK_ASYNC_NOSPACE          0
#define SOCK_ASYNC_WAITDATA         1
#define SOCK_NOSPACE                2
#define SOCK_PASSCRED               3
#define SOCK_PASSSEC                4
#define SOCK_EXTERNALLY_ALLOCATED   5

socket—proto_ops—inetsw_array等基本结构-编程知识网

socket—proto_ops—inetsw_array等基本结构-编程知识网

const struct proto_ops    *ops;

用来将套接口系统调用映射到传输层相应的协议实现。

socket—proto_ops—inetsw_array等基本结构-编程知识网

2. proto_ops结构体

是一组与套接口系统调用相对应的传输层函数指针,proto_ops是套接口系统调用到传输层函数的映射,其中某些操作函数会继续通过proto结构跳转表,进入具体的传输层或网络层的处理。

struct proto_ops {int		family;//协议族,如AF_INETstruct module	*owner;//所属模块/*与socket系统调用相对应的传输层函数指针*/int		(*release)   (struct socket *sock);int		(*bind)	     (struct socket *sock,struct sockaddr *myaddr,int sockaddr_len);int		(*connect)   (struct socket *sock,struct sockaddr *vaddr,int sockaddr_len, int flags);int		(*socketpair)(struct socket *sock1,struct socket *sock2);int		(*accept)    (struct socket *sock,struct socket *newsock, int flags);int		(*getname)   (struct socket *sock,struct sockaddr *addr,int *sockaddr_len, int peer);unsigned int	(*poll)	     (struct file *file, struct socket *sock,struct poll_table_struct *wait);int		(*ioctl)     (struct socket *sock, unsigned int cmd,unsigned long arg);
#ifdef CONFIG_COMPATint	 	(*compat_ioctl) (struct socket *sock, unsigned int cmd,unsigned long arg);
#endifint		(*listen)    (struct socket *sock, int len);int		(*shutdown)  (struct socket *sock, int flags);int		(*setsockopt)(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen);int		(*getsockopt)(struct socket *sock, int level,int optname, char __user *optval, int __user *optlen);
#ifdef CONFIG_COMPATint		(*compat_setsockopt)(struct socket *sock, int level,int optname, char __user *optval, unsigned int optlen);int		(*compat_getsockopt)(struct socket *sock, int level,int optname, char __user *optval, int __user *optlen);
#endifint		(*sendmsg)   (struct socket *sock, struct msghdr *m,size_t total_len);/* Notes for implementing recvmsg:* ===============================* msg->msg_namelen should get updated by the recvmsg handlers* iff msg_name != NULL. It is by default 0 to prevent* returning uninitialized memory to user space.  The recvfrom* handlers can assume that msg.msg_name is either NULL or has* a minimum size of sizeof(struct sockaddr_storage).*/int		(*recvmsg)   (struct socket *sock, struct msghdr *m,size_t total_len, int flags);int		(*mmap)	     (struct file *file, struct socket *sock,struct vm_area_struct * vma);ssize_t		(*sendpage)  (struct socket *sock, struct page *page,int offset, size_t size, int flags);ssize_t 	(*splice_read)(struct socket *sock,  loff_t *ppos,struct pipe_inode_info *pipe, size_t len, unsigned int flags);int		(*set_peek_off)(struct sock *sk, int val);
};

PF_INET协议族中定义了三种proto_ops结构:


const struct proto_ops inet_stream_ops = {.family		   = PF_INET,.owner		   = THIS_MODULE,.release	   = inet_release,.bind		   = inet_bind,.connect	   = inet_stream_connect,.socketpair	   = sock_no_socketpair,.accept		   = inet_accept,.getname	   = inet_getname,.poll		   = tcp_poll,.ioctl		   = inet_ioctl,.listen		   = inet_listen,.shutdown	   = inet_shutdown,.setsockopt	   = sock_common_setsockopt,.getsockopt	   = sock_common_getsockopt,.sendmsg	   = inet_sendmsg,.recvmsg	   = inet_recvmsg,.mmap		   = sock_no_mmap,.sendpage	   = inet_sendpage,.splice_read	   = tcp_splice_read,
#ifdef CONFIG_COMPAT.compat_setsockopt = compat_sock_common_setsockopt,.compat_getsockopt = compat_sock_common_getsockopt,.compat_ioctl	   = inet_compat_ioctl,
#endif
};

const struct proto_ops inet_dgram_ops = {.family		   = PF_INET,.owner		   = THIS_MODULE,.release	   = inet_release,.bind		   = inet_bind,.connect	   = inet_dgram_connect,.socketpair	   = sock_no_socketpair,.accept		   = sock_no_accept,.getname	   = inet_getname,.poll		   = udp_poll,.ioctl		   = inet_ioctl,.listen		   = sock_no_listen,.shutdown	   = inet_shutdown,.setsockopt	   = sock_common_setsockopt,.getsockopt	   = sock_common_getsockopt,.sendmsg	   = inet_sendmsg,.recvmsg	   = inet_recvmsg,.mmap		   = sock_no_mmap,.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT.compat_setsockopt = compat_sock_common_setsockopt,.compat_getsockopt = compat_sock_common_getsockopt,.compat_ioctl	   = inet_compat_ioctl,
#endif
};

 


/** For SOCK_RAW sockets; should be the same as inet_dgram_ops but without* udp_poll*/
static const struct proto_ops inet_sockraw_ops = {.family		   = PF_INET,.owner		   = THIS_MODULE,.release	   = inet_release,.bind		   = inet_bind,.connect	   = inet_dgram_connect,.socketpair	   = sock_no_socketpair,.accept		   = sock_no_accept,.getname	   = inet_getname,.poll		   = datagram_poll,.ioctl		   = inet_ioctl,.listen		   = sock_no_listen,.shutdown	   = inet_shutdown,.setsockopt	   = sock_common_setsockopt,.getsockopt	   = sock_common_getsockopt,.sendmsg	   = inet_sendmsg,.recvmsg	   = inet_recvmsg,.mmap		   = sock_no_mmap,.sendpage	   = inet_sendpage,
#ifdef CONFIG_COMPAT.compat_setsockopt = compat_sock_common_setsockopt,.compat_getsockopt = compat_sock_common_getsockopt,.compat_ioctl	   = inet_compat_ioctl,
#endif
};

proto_ops结构完成的是从与协议无关的套接口层到协议相关的传输层的转换,proto结构完成的是传输层到网络层的映射。因此,传输层的每个协议都要定义一个特定的proto_ops结构和proto结构实例。在IPv4协议族中,一个传输层协议对应一个inet_protosw结构体,inet_protosw结构体中包含了指向proto_ops结构和proto结构的指针。协议族中的所有inet_protosw结构实例都定义在静态数组inetsw_array中。并且,在网络子系统初始化时,根据每个结构的type成员,也就是socket类型,注册到一个全局的list_head结构体数组inetsw中,socket类型相同的inet_protosw结构体构成双向循环链表。

关于sw的含义:sw是switch的缩写,inetsw即inet protocol switch(协议切换表),这一命名沿用自BSD的protosw(protocol switch)结构。

(1). inetsw[]数组

/*
 * The inetsw table contains everything that inet_create needs to
 * build a new socket.
 *
 * One list head per socket type: inet_register_protosw() indexes this
 * array with the type of the inet_protosw entry it links in.
 */
static struct list_head inetsw[SOCK_MAX];

(2). inetsw_array[]数组

需要注意,这个数组的初始化很容易让人误解:每个元素中的struct list_head list成员没有被显式初始化,乍一看会以为结构体只有这几个成员,从而想不明白它是怎么挂到inetsw[]数组的元素上去的。

/* Upon startup we insert all the elements in inetsw_array[] into* the linked list inetsw.*/
static struct inet_protosw inetsw_array[] =
{{.type =       SOCK_STREAM,.protocol =   IPPROTO_TCP,.prot =       &tcp_prot,.ops =        &inet_stream_ops,.flags =      INET_PROTOSW_PERMANENT |INET_PROTOSW_ICSK,},{.type =       SOCK_DGRAM,.protocol =   IPPROTO_UDP,.prot =       &udp_prot,.ops =        &inet_dgram_ops,.flags =      INET_PROTOSW_PERMANENT,},{.type =       SOCK_DGRAM,.protocol =   IPPROTO_ICMP,.prot =       &ping_prot,.ops =        &inet_sockraw_ops,.flags =      INET_PROTOSW_REUSE,},{.type =       SOCK_RAW,.protocol =   IPPROTO_IP,	/* wild card */.prot =       &raw_prot,.ops =        &inet_sockraw_ops,.flags =      INET_PROTOSW_REUSE,}
};

(3). inet_protosw结构体
看一下,这个结构体中有一个struct list_head list元素,用于挂到inetsw[]中的元素上。

/* This is used to register socket interfaces for IP protocols.  */
struct inet_protosw {
	/* Linkage used to chain this entry onto one of the inetsw[] lists. */
	struct list_head list;

	/* These two fields form the lookup key.  */
	unsigned short	 type;	   /* This is the 2nd argument to socket(2). */
	unsigned short	 protocol; /* This is the L4 protocol number.  */

	struct proto	 *prot;
	const struct proto_ops *ops;

	unsigned char	 flags;      /* See INET_PROTOSW_* below.  */
};

 (4). 在inet_init函数中,对inetsw[]数组进行了初始化,

/*
 * inet_init (excerpt) - the part that sets up the inetsw[] lists.
 * The "..." lines mark code elided from the excerpt.
 */
static int __init inet_init(void)
{
	struct inet_protosw *q;
	struct list_head *r;
	...
	/* Register the socket-side information for inet_create. */

	/* Initialise every list head in inetsw[]. */
	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
		INIT_LIST_HEAD(r);

	/* Hand each inetsw_array[] entry to inet_register_protosw(). */
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);
	...
}

(5). inet_register_protosw()函数

在这个函数中,把inetsw_array中的元素(inet_protosw结构体)挂到inetsw[]中与其type对应的链表上。

void inet_register_protosw(struct inet_protosw *p)
{struct list_head *lh;struct inet_protosw *answer;int protocol = p->protocol;struct list_head *last_perm;spin_lock_bh(&inetsw_lock);if (p->type >= SOCK_MAX)goto out_illegal;/* If we are trying to override a permanent protocol, bail. */answer = NULL;last_perm = &inetsw[p->type];list_for_each(lh, &inetsw[p->type]) {answer = list_entry(lh, struct inet_protosw, list);/* Check only the non-wild match. *//*这个函数这里有点怪,不知道它想要干什么,按理说,找到对应的首节点之后,直接插入元素就可以了,怎么还干这么多幺蛾子。*/if (INET_PROTOSW_PERMANENT & answer->flags) {if (protocol == answer->protocol)break;last_perm = lh;}answer = NULL;}if (answer)goto out_permanent;/* Add the new entry after the last permanent entry if any, so that* the new entry does not override a permanent entry when matched with* a wild-card protocol. But it is allowed to override any existing* non-permanent entry.  This means that when we remove this entry, the* system automatically returns to the old behavior.*/list_add_rcu(&p->list, last_perm);
out:spin_unlock_bh(&inetsw_lock);return;out_permanent:pr_err("Attempt to override permanent protocol %d\n", protocol);goto out;out_illegal:pr_err("Ignoring attempt to register invalid socket type %d\n",p->type);goto out;
}

初始化完成后,inetsw[]和inetsw_array[]关系示意图如图所示。

socket—proto_ops—inetsw_array等基本结构-编程知识网

以第一个元素type=SOCK_STREAM,protocol=IPPROTO_TCP为例,该类型适用于TCP协议:当创建TCP socket时,其操作socket->ops赋值为&inet_stream_ops,对应的传输控制块操作sock->sk_prot赋值为&tcp_prot。

https://www.cnblogs.com/wanpengcoder/p/7623101.html

3. struct sock结构体

\linux-4.1.45\include\net\sock.h
/***	struct sock - network layer representation of sockets*	@__sk_common: shared layout with inet_timewait_sock*	@sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN*	@sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings*	@sk_lock:	synchronizer*	@sk_rcvbuf: size of receive buffer in bytes*	@sk_wq: sock wait queue and async head*	@sk_rx_dst: receive input route used by early demux*	@sk_dst_cache: destination cache*	@sk_dst_lock: destination cache lock*	@sk_policy: flow policy*	@sk_receive_queue: incoming packets*	@sk_wmem_alloc: transmit queue bytes committed*	@sk_write_queue: Packet sending queue*	@sk_omem_alloc: "o" is "option" or "other"*	@sk_wmem_queued: persistent queue size*	@sk_forward_alloc: space allocated forward*	@sk_napi_id: id of the last napi context to receive data for sk*	@sk_ll_usec: usecs to busypoll when there is no data*	@sk_allocation: allocation mode*	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)*	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)*	@sk_sndbuf: size of send buffer in bytes*	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,*		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings*	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets*	@sk_no_check_rx: allow zero checksum in RX packets*	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)*	@sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)*	@sk_gso_type: GSO type (e.g. 
%SKB_GSO_TCPV4)*	@sk_gso_max_size: Maximum GSO segment size to build*	@sk_gso_max_segs: Maximum number of GSO segments*	@sk_lingertime: %SO_LINGER l_linger setting*	@sk_backlog: always used with the per-socket spinlock held*	@sk_callback_lock: used with the callbacks in the end of this struct*	@sk_error_queue: rarely used*	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,*			  IPV6_ADDRFORM for instance)*	@sk_err: last error*	@sk_err_soft: errors that don't cause failure but are the cause of a*		      persistent failure not just 'timed out'*	@sk_drops: raw/udp drops counter*	@sk_ack_backlog: current listen backlog*	@sk_max_ack_backlog: listen backlog set in listen()*	@sk_priority: %SO_PRIORITY setting*	@sk_cgrp_prioidx: socket group's priority map index*	@sk_type: socket type (%SOCK_STREAM, etc)*	@sk_protocol: which protocol this socket belongs in this network family*	@sk_peer_pid: &struct pid for this socket's peer*	@sk_peer_cred: %SO_PEERCRED setting*	@sk_rcvlowat: %SO_RCVLOWAT setting*	@sk_rcvtimeo: %SO_RCVTIMEO setting*	@sk_sndtimeo: %SO_SNDTIMEO setting*	@sk_rxhash: flow hash received from netif layer*	@sk_incoming_cpu: record cpu processing incoming packets*	@sk_txhash: computed flow hash for use on transmit*	@sk_filter: socket filtering instructions*	@sk_protinfo: private area, net family specific, when not using slab*	@sk_timer: sock cleanup timer*	@sk_stamp: time stamp of last packet received*	@sk_tsflags: SO_TIMESTAMPING socket options*	@sk_tskey: counter to disambiguate concurrent tstamp requests*	@sk_socket: Identd and reporting IO signals*	@sk_user_data: RPC layer private data*	@sk_frag: cached page frag*	@sk_peek_off: current peek_offset value*	@sk_send_head: front of stuff to transmit*	@sk_security: used by security modules*	@sk_mark: generic packet mark*	@sk_classid: this socket's cgroup classid*	@sk_cgrp: this socket's cgroup-specific proto data*	@sk_write_pending: a write to stream socket waits to start*	@sk_state_change: 
callback to indicate change in the state of the sock*	@sk_data_ready: callback to indicate there is data to be processed*	@sk_write_space: callback to indicate there is bf sending space available*	@sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)*	@sk_backlog_rcv: callback to process the backlog*	@sk_destruct: called at sock freeing time, i.e. when all refcnt == 0*/
struct sock {/** Now struct inet_timewait_sock also uses sock_common, so please just* don't add nothing before this first member (__sk_common) --acme*/struct sock_common	__sk_common;
#define sk_node			__sk_common.skc_node
#define sk_nulls_node		__sk_common.skc_nulls_node
#define sk_refcnt		__sk_common.skc_refcnt
#define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping#define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
#define sk_dontcopy_end		__sk_common.skc_dontcopy_end
#define sk_hash			__sk_common.skc_hash
#define sk_portpair		__sk_common.skc_portpair
#define sk_num			__sk_common.skc_num
#define sk_dport		__sk_common.skc_dport
#define sk_addrpair		__sk_common.skc_addrpair
#define sk_daddr		__sk_common.skc_daddr
#define sk_rcv_saddr		__sk_common.skc_rcv_saddr
#define sk_family		__sk_common.skc_family
#define sk_state		__sk_common.skc_state
#define sk_reuse		__sk_common.skc_reuse
#define sk_reuseport		__sk_common.skc_reuseport
#define sk_ipv6only		__sk_common.skc_ipv6only
#define sk_bound_dev_if		__sk_common.skc_bound_dev_if
#define sk_bind_node		__sk_common.skc_bind_node
#define sk_prot			__sk_common.skc_prot
#define sk_net			__sk_common.skc_net
#define sk_v6_daddr		__sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
#define sk_cookie		__sk_common.skc_cookiesocket_lock_t		sk_lock;struct sk_buff_head	sk_receive_queue;/** The backlog queue is special, it is always used with* the per-socket spinlock held and requires low latency* access. Therefore we special case it's implementation.* Note : rmem_alloc is in this structure to fill a hole* on 64bit arches, not because its logically part of* backlog.*/struct {atomic_t	rmem_alloc;int		len;struct sk_buff	*head;struct sk_buff	*tail;} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_allocint			sk_forward_alloc;
#ifdef CONFIG_RPS__u32			sk_rxhash;
#endifu16			sk_incoming_cpu;/* 16bit hole* Warned : sk_incoming_cpu can be set from softirq,* Do not use this hole without fully understanding possible issues.*/__u32			sk_txhash;
#ifdef CONFIG_NET_RX_BUSY_POLLunsigned int		sk_napi_id;unsigned int		sk_ll_usec;
#endifatomic_t		sk_drops;int			sk_rcvbuf;struct sk_filter __rcu	*sk_filter;struct socket_wq __rcu	*sk_wq;#ifdef CONFIG_XFRMstruct xfrm_policy	*sk_policy[2];
#endifunsigned long 		sk_flags;struct dst_entry	*sk_rx_dst;struct dst_entry __rcu	*sk_dst_cache;spinlock_t		sk_dst_lock;atomic_t		sk_wmem_alloc;atomic_t		sk_omem_alloc;int			sk_sndbuf;struct sk_buff_head	sk_write_queue;kmemcheck_bitfield_begin(flags);unsigned int		sk_shutdown  : 2,sk_no_check_tx : 1,sk_no_check_rx : 1,sk_userlocks : 4,sk_protocol  : 8,
#define SK_PROTOCOL_MAX U8_MAXsk_type      : 16;kmemcheck_bitfield_end(flags);int			sk_wmem_queued;gfp_t			sk_allocation;u32			sk_pacing_rate; /* bytes per second */u32			sk_max_pacing_rate;netdev_features_t	sk_route_caps;netdev_features_t	sk_route_nocaps;int			sk_gso_type;unsigned int		sk_gso_max_size;u16			sk_gso_max_segs;int			sk_rcvlowat;unsigned long	        sk_lingertime;struct sk_buff_head	sk_error_queue;struct proto		*sk_prot_creator;rwlock_t		sk_callback_lock;int			sk_err,sk_err_soft;u32			sk_ack_backlog;u32			sk_max_ack_backlog;__u32			sk_priority;
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)__u32			sk_cgrp_prioidx;
#endifstruct pid		*sk_peer_pid;const struct cred	*sk_peer_cred;long			sk_rcvtimeo;long			sk_sndtimeo;void			*sk_protinfo;struct timer_list	sk_timer;ktime_t			sk_stamp;u16			sk_tsflags;u32			sk_tskey;struct socket		*sk_socket;void			*sk_user_data;struct page_frag	sk_frag;struct sk_buff		*sk_send_head;__s32			sk_peek_off;int			sk_write_pending;
#ifdef CONFIG_SECURITYvoid			*sk_security;
#endif__u32			sk_mark;u32			sk_classid;struct cg_proto		*sk_cgrp;void			(*sk_state_change)(struct sock *sk);void			(*sk_data_ready)(struct sock *sk);void			(*sk_write_space)(struct sock *sk);void			(*sk_error_report)(struct sock *sk);int			(*sk_backlog_rcv)(struct sock *sk,struct sk_buff *skb);void                    (*sk_destruct)(struct sock *sk);
};/***	struct sock_common - minimal network layer representation of sockets*	@skc_daddr: Foreign IPv4 addr*	@skc_rcv_saddr: Bound local IPv4 addr*	@skc_hash: hash value used with various protocol lookup tables*	@skc_u16hashes: two u16 hash values used by UDP lookup tables*	@skc_dport: placeholder for inet_dport/tw_dport*	@skc_num: placeholder for inet_num/tw_num*	@skc_family: network address family*	@skc_state: Connection state*	@skc_reuse: %SO_REUSEADDR setting*	@skc_reuseport: %SO_REUSEPORT setting*	@skc_bound_dev_if: bound device index if != 0*	@skc_bind_node: bind hash linkage for various protocol lookup tables*	@skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol*	@skc_prot: protocol handlers inside a network family*	@skc_net: reference to the network namespace of this socket*	@skc_node: main hash linkage for various protocol lookup tables*	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol*	@skc_tx_queue_mapping: tx queue number for this connection*	@skc_refcnt: reference count**	This is the minimal network layer representation of sockets, the header*	for struct sock and struct inet_timewait_sock.*/
struct sock_common {/* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned* address on 64bit arches : cf INET_MATCH()*/union {__addrpair	skc_addrpair;struct {__be32	skc_daddr;__be32	skc_rcv_saddr;};};union  {unsigned int	skc_hash;__u16		skc_u16hashes[2];};/* skc_dport && skc_num must be grouped as well */union {__portpair	skc_portpair;struct {__be16	skc_dport;__u16	skc_num;};};unsigned short		skc_family;volatile unsigned char	skc_state;unsigned char		skc_reuse:4;unsigned char		skc_reuseport:1;unsigned char		skc_ipv6only:1;int			skc_bound_dev_if;union {struct hlist_node	skc_bind_node;struct hlist_nulls_node skc_portaddr_node;};struct proto		*skc_prot;possible_net_t		skc_net;#if IS_ENABLED(CONFIG_IPV6)struct in6_addr		skc_v6_daddr;struct in6_addr		skc_v6_rcv_saddr;
#endifatomic64_t		skc_cookie;/** fields between dontcopy_begin/dontcopy_end* are not copied in sock_copy()*//* private: */int			skc_dontcopy_begin[0];/* public: */union {struct hlist_node	skc_node;struct hlist_nulls_node skc_nulls_node;};int			skc_tx_queue_mapping;atomic_t		skc_refcnt;/* private: */int                     skc_dontcopy_end[0];/* public: */
};

struct proto tcp_prot = {.name			= "TCP",.owner			= THIS_MODULE,.close			= tcp_close,.connect		= tcp_v4_connect,.disconnect		= tcp_disconnect,.accept			= inet_csk_accept,.ioctl			= tcp_ioctl,.init			= tcp_v4_init_sock,.destroy		= tcp_v4_destroy_sock,.shutdown		= tcp_shutdown,.setsockopt		= tcp_setsockopt,.getsockopt		= tcp_getsockopt,.recvmsg		= tcp_recvmsg,.sendmsg		= tcp_sendmsg,.sendpage		= tcp_sendpage,.backlog_rcv		= tcp_v4_do_rcv,.release_cb		= tcp_release_cb,.hash			= inet_hash,.unhash			= inet_unhash,.get_port		= inet_csk_get_port,.enter_memory_pressure	= tcp_enter_memory_pressure,.stream_memory_free	= tcp_stream_memory_free,.sockets_allocated	= &tcp_sockets_allocated,.orphan_count		= &tcp_orphan_count,.memory_allocated	= &tcp_memory_allocated,.memory_pressure	= &tcp_memory_pressure,.sysctl_mem		= sysctl_tcp_mem,.sysctl_wmem		= sysctl_tcp_wmem,.sysctl_rmem		= sysctl_tcp_rmem,.max_header		= MAX_TCP_HEADER,.obj_size		= sizeof(struct tcp_sock),.slab_flags		= SLAB_DESTROY_BY_RCU,.twsk_prot		= &tcp_timewait_sock_ops,.rsk_prot		= &tcp_request_sock_ops,.h.hashinfo		= &tcp_hashinfo,.no_autobind		= true,
#ifdef CONFIG_COMPAT.compat_setsockopt	= compat_tcp_setsockopt,.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM.init_cgroup		= tcp_init_cgroup,.destroy_cgroup		= tcp_destroy_cgroup,.proto_cgroup		= tcp_proto_cgroup,
#endif
};

struct proto udp_prot = {.name		   = "UDP",.owner		   = THIS_MODULE,.close		   = udp_lib_close,.connect	   = ip4_datagram_connect,.disconnect	   = udp_disconnect,.ioctl		   = udp_ioctl,.destroy	   = udp_destroy_sock,.setsockopt	   = udp_setsockopt,.getsockopt	   = udp_getsockopt,.sendmsg	   = udp_sendmsg,.recvmsg	   = udp_recvmsg,.sendpage	   = udp_sendpage,.backlog_rcv	   = __udp_queue_rcv_skb,.release_cb	   = ip4_datagram_release_cb,.hash		   = udp_lib_hash,.unhash		   = udp_lib_unhash,.rehash		   = udp_v4_rehash,.get_port	   = udp_v4_get_port,.memory_allocated  = &udp_memory_allocated,.sysctl_mem	   = sysctl_udp_mem,.sysctl_wmem	   = &sysctl_udp_wmem_min,.sysctl_rmem	   = &sysctl_udp_rmem_min,.obj_size	   = sizeof(struct udp_sock),.slab_flags	   = SLAB_DESTROY_BY_RCU,.h.udp_table	   = &udp_table,
#ifdef CONFIG_COMPAT.compat_setsockopt = compat_udp_setsockopt,.compat_getsockopt = compat_udp_getsockopt,
#endif.clear_sk	   = sk_prot_clear_portaddr_nulls,
};

 

struct proto raw_prot = {.name =		"RAW",.owner =	THIS_MODULE,.close =	raw_close,.connect =	ip4_datagram_connect,.disconnect =	udp_disconnect,.ioctl =	raw_ioctl,.init =		raw_init,.setsockopt =	raw_setsockopt,.getsockopt =	raw_getsockopt,.sendmsg =	raw_sendmsg,.recvmsg =	raw_recvmsg,.bind =		raw_bind,.backlog_rcv =	raw_rcv_skb,.hash =		raw_v4_hash,.unhash =	raw_v4_unhash,.slab_obj_size = sizeof(struct raw_sock),
};

socket系列函数系统调用过程

        通常,应用程序中调用库函数,库函数通过系统调用进入套接口层,Linux的套接口层提供了一组专门的套接口系统调用,分别在对应的库函数名之前加上"sys_"前缀。
        在Linux内核中,每一个系统调用均被编号(系统调用号),当进程进行一个系统调用时,通过中断指令“int 0x80”,从用户空间进入内核空间,并将系统调用号等作为参数传递给内核函数。在Linux系统中所有的系统调用都会进入系统的同一个地址,就是system_call,最终根据系统调用号,调用系统调用表sys_call_table中的某一个函数。系统中所有的socket系统调用总入口为sys_socketcall()函数。
       在Linux2.6.10中,sys_socketcall()还是一个直接的函数,在4.1.15中函数的实现涉及到宏定义的替换。

       socket()系列函数系统调用过程示意图如图所示:以sendmsg()函数为例

socket—proto_ops—inetsw_array等基本结构-编程知识网

PS:

socket—proto_ops—inetsw_array等基本结构-编程知识网

在x86-64上是没有socketcall()系统调用的,sendmsg等函数都是直接调用sys_sendmsg,不用中转。

在x86-32上以前只有socketcall()这一个总入口,但是从Linux 4.3开始,也提供了直接的socket系统调用。

socket—proto_ops—inetsw_array等基本结构-编程知识网

1. sys_socketcall()函数

sys_socketcall()函数声明如下:

asmlinkage long sys_socketcall(int call, unsigned long __user *args);

只看到函数声明,但是找不到函数实现,是因为进行了宏替换。

 sys_socketcall()函数定义为:

/*
 *	System call vectors.
 *
 *	Argument checking cleaned up. Saved 20% in size.
 *  This function doesn't need to set the kernel lock because
 *  it is set by the callees.
 *
 *  @call selects the socket operation (one of the SYS_* numbers) and @args
 *  points to the user-space argument block for that operation. Returns
 *  -EINVAL for an out-of-range @call or an oversized argument block,
 *  -EFAULT when the block cannot be copied in, a non-zero
 *  audit_socketcall() result, or otherwise the selected handler's result.
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
	unsigned long a[AUDITSC_ARGS];	/* a[6] */
	unsigned long a0, a1;
	int err;
	unsigned int len;

	/* Validate the call number. */
	if (call < 1 || call > SYS_SENDMMSG)
		return -EINVAL;

	/*
	 * Byte length of the user-supplied argument block for this call;
	 * at most 6 * sizeof(unsigned long), i.e. it must fit in a[].
	 */
	len = nargs[call];
	if (len > sizeof(a))
		return -EINVAL;

	/* copy_from_user should be SMP safe. */
	/* Copy the user arguments into a[]. */
	if (copy_from_user(a, args, len))
		return -EFAULT;

	/*
	 * Hand the argument count and the copied arguments to
	 * audit_socketcall(); a non-zero result aborts the call.
	 * NOTE(review): presumably this records the call for the audit
	 * subsystem -- confirm against audit_socketcall().
	 */
	err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
	if (err)
		return err;

	/* Cache the first two arguments in locals; every case below uses a0. */
	a0 = a[0];
	a1 = a[1];

	/* Dispatch on the socket call number. */
	switch (call) {
	case SYS_SOCKET:
		err = sys_socket(a0, a1, a[2]);
		break;
	case SYS_BIND:
		err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_CONNECT:
		err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_LISTEN:
		err = sys_listen(a0, a1);
		break;
	case SYS_ACCEPT:
		err = sys_accept4(a0, (struct sockaddr __user *)a1,
				  (int __user *)a[2], 0);
		break;
	case SYS_GETSOCKNAME:
		err =
		    sys_getsockname(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_GETPEERNAME:
		err =
		    sys_getpeername(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_SOCKETPAIR:
		err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
		break;
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
	case SYS_RECV:
		err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_RECVFROM:
		err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				   (struct sockaddr __user *)a[4],
				   (int __user *)a[5]);
		break;
	case SYS_SHUTDOWN:
		err = sys_shutdown(a0, a1);
		break;
	case SYS_SETSOCKOPT:
		err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
		break;
	case SYS_GETSOCKOPT:
		err =
		    sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
				   (int __user *)a[4]);
		break;
	case SYS_SENDMSG:
		err = sys_sendmsg(a0, (struct user_msghdr __user *)a1, a[2]);
		break;
	case SYS_SENDMMSG:
		err = sys_sendmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3]);
		break;
	case SYS_RECVMSG:
		err = sys_recvmsg(a0, (struct user_msghdr __user *)a1, a[2]);
		break;
	case SYS_RECVMMSG:
		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
				   (struct timespec __user *)a[4]);
		break;
	case SYS_ACCEPT4:
		err = sys_accept4(a0, (struct sockaddr __user *)a1,
				  (int __user *)a[2], a[3]);
		break;
	default:
		err = -EINVAL;
		break;
	}
	return err;
}

(1). 关于SYSCALL_DEFINE系列宏定义:

/*
 * SYSCALL_DEFINEn(name, ...): forwards to SYSCALL_DEFINEx with the argument
 * count n and with an underscore prepended to the system call name.
 */
#define SYSCALL_DEFINE1(name, ...)	SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...)	SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...)	SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...)	SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...)	SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...)	SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

(2). 关于SYSCALL_DEFINEx:

/*
 * SYSCALL_DEFINEx(x, sname, ...): expands to SYSCALL_METADATA(...) followed
 * by __SYSCALL_DEFINEx(...), both given the same argument count, name and
 * argument list.
 */
#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

(3). 关于__SYSCALL_DEFINEx:

/*
 * __SYSCALL_DEFINEx(x, name, ...): emits, in order,
 *  - a declaration of sys<name> as an alias of SyS<name>,
 *  - a forward declaration of the static inline SYSC<name>,
 *  - the definition of SyS<name>, which calls SYSC<name> and returns
 *    its result,
 *  - the header of SYSC<name>; the function body written after the macro
 *    invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

可以看到在__SYSCALL_DEFINEx宏中asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))就是对应的系统调用函数。 

接着sys_socketcall()函数分析,在sys_socketcall()函数中,取出用户空间传递的参数,然后根据socket系统调用号call,进入到对应的系统调用函数。

(4). 关于nargs数组:

存放了每种socket系统调用用户空间参数的长度。

/* Argument list sizes for sys_socketcall */
/* AL(x): size in bytes of an argument block holding x unsigned longs. */
#define AL(x) ((x) * sizeof(unsigned long))
/* Indexed by the socketcall number (SYS_*); slot 0 is unused. */
static const unsigned char nargs[21] = {
	AL(0),	/*  0: (unused)        */
	AL(3),	/*  1: SYS_SOCKET      */
	AL(3),	/*  2: SYS_BIND        */
	AL(3),	/*  3: SYS_CONNECT     */
	AL(2),	/*  4: SYS_LISTEN      */
	AL(3),	/*  5: SYS_ACCEPT      */
	AL(3),	/*  6: SYS_GETSOCKNAME */
	AL(3),	/*  7: SYS_GETPEERNAME */
	AL(4),	/*  8: SYS_SOCKETPAIR  */
	AL(4),	/*  9: SYS_SEND        */
	AL(4),	/* 10: SYS_RECV        */
	AL(6),	/* 11: SYS_SENDTO      */
	AL(6),	/* 12: SYS_RECVFROM    */
	AL(2),	/* 13: SYS_SHUTDOWN    */
	AL(5),	/* 14: SYS_SETSOCKOPT  */
	AL(5),	/* 15: SYS_GETSOCKOPT  */
	AL(3),	/* 16: SYS_SENDMSG     */
	AL(3),	/* 17: SYS_RECVMSG     */
	AL(4),	/* 18: SYS_ACCEPT4     */
	AL(5),	/* 19: SYS_RECVMMSG    */
	AL(4)	/* 20: SYS_SENDMMSG    */
};

(5). call值定义如下:

//include/uapi/linux/net.h
/* Call numbers accepted as the first argument of sys_socketcall(). */
#define SYS_SOCKET         1	/* sys_socket(2) */
#define SYS_BIND           2	/* sys_bind(2) */
#define SYS_CONNECT        3	/* sys_connect(2) */
#define SYS_LISTEN         4	/* sys_listen(2) */
#define SYS_ACCEPT         5	/* sys_accept(2) */
#define SYS_GETSOCKNAME    6	/* sys_getsockname(2) */
#define SYS_GETPEERNAME    7	/* sys_getpeername(2) */
#define SYS_SOCKETPAIR     8	/* sys_socketpair(2) */
#define SYS_SEND           9	/* sys_send(2) */
#define SYS_RECV           10	/* sys_recv(2) */
#define SYS_SENDTO         11	/* sys_sendto(2) */
#define SYS_RECVFROM       12	/* sys_recvfrom(2) */
#define SYS_SHUTDOWN       13	/* sys_shutdown(2) */
#define SYS_SETSOCKOPT     14	/* sys_setsockopt(2) */
#define SYS_GETSOCKOPT     15	/* sys_getsockopt(2) */
#define SYS_SENDMSG        16	/* sys_sendmsg(2) */
#define SYS_RECVMSG        17	/* sys_recvmsg(2) */
#define SYS_ACCEPT4        18	/* sys_accept4(2) */
#define SYS_RECVMMSG       19	/* sys_recvmmsg(2) */
#define SYS_SENDMMSG       20	/* sys_sendmmsg(2) */

2.socket()函数系统调用过程

见下一篇博客。