Back to home page

OSCL-LXR

 
 

    


0001 /* SPDX-License-Identifier: GPL-2.0 */
0002 #ifndef _RDS_RDS_H
0003 #define _RDS_RDS_H
0004 
0005 #include <net/sock.h>
0006 #include <linux/scatterlist.h>
0007 #include <linux/highmem.h>
0008 #include <rdma/rdma_cm.h>
0009 #include <linux/mutex.h>
0010 #include <linux/rds.h>
0011 #include <linux/rhashtable.h>
0012 #include <linux/refcount.h>
0013 #include <linux/in6.h>
0014 
0015 #include "info.h"
0016 
/*
 * RDS Network protocol version
 *
 * A version is encoded as (major << 8) | minor, e.g. 0x0301 is 3.1.
 * RDS_PROTOCOL_MAJOR()/RDS_PROTOCOL_MINOR() split such a value again and
 * RDS_PROTOCOL() builds one from its two components.
 */
#define RDS_PROTOCOL_3_0    0x0300
#define RDS_PROTOCOL_3_1    0x0301
#define RDS_PROTOCOL_4_0    0x0400
#define RDS_PROTOCOL_4_1    0x0401
#define RDS_PROTOCOL_VERSION    RDS_PROTOCOL_3_1
#define RDS_PROTOCOL_MAJOR(v)   ((v) >> 8)
#define RDS_PROTOCOL_MINOR(v)   ((v) & 255)
/* Both arguments are parenthesized so expression arguments expand safely. */
#define RDS_PROTOCOL(maj, min)  (((maj) << 8) | (min))
#define RDS_PROTOCOL_COMPAT_VERSION RDS_PROTOCOL_3_1
0029 
0030 /* The following ports, 16385, 18634, 18635, are registered with IANA as
0031  * the ports to be used for RDS over TCP and UDP.  Currently, only RDS over
0032  * TCP and RDS over IB/RDMA are implemented.  18634 is the historical value
0033  * used for the RDMA_CM listener port.  RDS/TCP uses port 16385.  After
0034  * IPv6 work, RDMA_CM also uses 16385 as the listener port.  18634 is kept
0035  * to ensure compatibility with older RDS modules.  Those ports are defined
0036  * in each transport's header file.
 *
 * RDS_PORT below is the historical 18634 value.
0037  */
0038 #define RDS_PORT    18634
0039 
/* Define KERNEL_HAS_ATOMIC64 whenever the kernel headers provide
 * ATOMIC64_INIT.
 */
0040 #ifdef ATOMIC64_INIT
0041 #define KERNEL_HAS_ATOMIC64
0042 #endif
0043 #ifdef RDS_DEBUG
0044 #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
0045 #else
0046 /* sigh, pr_debug() causes unused variable warnings */
0047 static inline __printf(1, 2)
0048 void rdsdebug(char *fmt, ...)
0049 {
0050 }
0051 #endif
0052 
/* Fragment size: 1 << RDS_FRAG_SHIFT = 4096 bytes. */
0053 #define RDS_FRAG_SHIFT  12
0054 #define RDS_FRAG_SIZE   ((unsigned int)(1 << RDS_FRAG_SHIFT))
0055 
0056 /* Used to limit both RDMA and non-RDMA RDS message to 1MB */
0057 #define RDS_MAX_MSG_SIZE    ((unsigned int)(1 << 20))
0058 
/* Congestion map geometry: 65536 bits = 8192 bytes, rounded up to whole
 * pages; each page holds PAGE_SIZE * 8 bits.
 * NOTE(review): presumably one bit per 16-bit port, judging by the __be16
 * port argument of rds_cong_set_bit()/rds_cong_clear_bit() -- confirm.
 */
0059 #define RDS_CONG_MAP_BYTES  (65536 / 8)
0060 #define RDS_CONG_MAP_PAGES  (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
0061 #define RDS_CONG_MAP_PAGE_BITS  (PAGE_SIZE * 8)
0062 
/* Congestion map associated with one address (m_addr).  The bitmap storage
 * is referenced through RDS_CONG_MAP_PAGES page addresses in m_page_addrs.
 */
0063 struct rds_cong_map {
0064     struct rb_node      m_rb_node;
0065     struct in6_addr     m_addr;
0066     wait_queue_head_t   m_waitq;
0067     struct list_head    m_conn_list;
0068     unsigned long       m_page_addrs[RDS_CONG_MAP_PAGES]; /* one entry per map page */
0069 };
0070 
0071 
0072 /*
0073  * This is how we will track the connection state:
0074  * A connection is always in one of the following
0075  * states. Updates to the state are atomic and imply
0076  * a memory barrier.
 *
 * The value lives in rds_conn_path.cp_state (an atomic_t); it is read by
 * rds_conn_path_state() and changed with rds_conn_path_transition().
0077  */
0078 enum {
0079     RDS_CONN_DOWN = 0,
0080     RDS_CONN_CONNECTING,
0081     RDS_CONN_DISCONNECTING,
0082     RDS_CONN_UP,
0083     RDS_CONN_RESETTING,
0084     RDS_CONN_ERROR,
0085 };
0086 
0087 /* Bits for c_flags
 * NOTE(review): the only flags word visible in this header is
 * rds_conn_path.cp_flags; the values are consecutive small integers, so
 * they look like bit numbers rather than masks -- confirm against users.
 */
0088 #define RDS_LL_SEND_FULL    0
0089 #define RDS_RECONNECT_PENDING   1
0090 #define RDS_IN_XMIT     2
0091 #define RDS_RECV_REFILL     3
0092 #define RDS_DESTROY_PENDING 4
0093 
0094 /* Max number of multipaths per RDS connection. Must be a power of 2 */
0095 #define RDS_MPATH_WORKERS   8
/* Hash a socket's bound port, seeded with rs_hash_initval, and mask the
 * result with (n - 1); the mask is why n must be a power of 2.
 */
0096 #define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
0097                    (rs)->rs_hash_initval) & ((n) - 1))
0098 
/* True when laddr compares lower than faddr after both pass through htonl().
 * NOTE(review): if the arguments are already network-order addresses,
 * ntohl() would state the intent more clearly -- confirm with callers.
 */
0099 #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
0100 
0101 /* Per mpath connection state */
0102 struct rds_conn_path {
0103     struct rds_connection   *cp_conn;
0104     struct rds_message  *cp_xmit_rm;
0105     unsigned long       cp_xmit_sg;
0106     unsigned int        cp_xmit_hdr_off;
0107     unsigned int        cp_xmit_data_off;
0108     unsigned int        cp_xmit_atomic_sent;
0109     unsigned int        cp_xmit_rdma_sent;
0110     unsigned int        cp_xmit_data_sent;
0111 
0112     spinlock_t      cp_lock;        /* protect msg queues */
0113     u64         cp_next_tx_seq;
0114     struct list_head    cp_send_queue;
0115     struct list_head    cp_retrans;
0116 
0117     u64         cp_next_rx_seq;
0118 
0119     void            *cp_transport_data;
0120 
    /* One of the RDS_CONN_* states; accessed through the
     * rds_conn_path_state()/rds_conn_path_transition() helpers.
     */
0121     atomic_t        cp_state;
0122     unsigned long       cp_send_gen;
0123     unsigned long       cp_flags;
0124     unsigned long       cp_reconnect_jiffies;
0125     struct delayed_work cp_send_w;
0126     struct delayed_work cp_recv_w;
0127     struct delayed_work cp_conn_w;
0128     struct work_struct  cp_down_w;
0129     struct mutex        cp_cm_lock; /* protect cp_state & cm */
0130     wait_queue_head_t   cp_waitq;
0131 
0132     unsigned int        cp_unacked_packets;
0133     unsigned int        cp_unacked_bytes;
0134     unsigned int        cp_index; /* presumably this path's slot in conn->c_path[] -- confirm */
0135 };
0136 
0137 /* One rds_connection per RDS address pair */
0138 struct rds_connection {
0139     struct hlist_node   c_hash_node;
0140     struct in6_addr     c_laddr;
0141     struct in6_addr     c_faddr;
0142     int         c_dev_if; /* ifindex used for this conn */
0143     int         c_bound_if; /* ifindex of c_laddr */
    /* Three 1-bit flags plus 29 padding bits: 32 bits in total. */
0144     unsigned int        c_loopback:1,
0145                 c_isv6:1,
0146                 c_ping_triggered:1,
0147                 c_pad_to_32:29;
0148     int         c_npaths;
0149     struct rds_connection   *c_passive;
0150     struct rds_transport    *c_trans;
0151 
0152     struct rds_cong_map *c_lcong;
0153     struct rds_cong_map *c_fcong;
0154 
0155     /* Protocol version */
0156     unsigned int        c_proposed_version;
0157     unsigned int        c_version;
    /* Read and written via rds_conn_net()/rds_conn_net_set(). */
0158     possible_net_t      c_net;
0159 
0160     /* TOS */
0161     u8          c_tos;
0162 
0163     struct list_head    c_map_item;
0164     unsigned long       c_map_queued;
0165 
    /* Path array; the rds_conn_*() helpers use c_path[0] and WARN when
     * the transport is multipath capable.
     */
0166     struct rds_conn_path    *c_path;
0167     wait_queue_head_t   c_hs_waitq; /* handshake waitq */
0168 
0169     u32         c_my_gen_num;
0170     u32         c_peer_gen_num;
0171 };
0172 
0173 static inline
0174 struct net *rds_conn_net(struct rds_connection *conn)
0175 {
0176     return read_pnet(&conn->c_net);
0177 }
0178 
0179 static inline
0180 void rds_conn_net_set(struct rds_connection *conn, struct net *net)
0181 {
0182     write_pnet(&conn->c_net, net);
0183 }
0184 
/* RDS header flag bits.
 * NOTE(review): presumably carried in rds_header.h_flags -- confirm.
 */
#define RDS_FLAG_CONG_BITMAP    0x01
#define RDS_FLAG_ACK_REQUIRED   0x02
#define RDS_FLAG_RETRANSMITTED  0x04
#define RDS_MAX_ADV_CREDIT  255

/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
 * probe to exchange control information before establishing a connection.
 * Currently the control information that is exchanged is the number of
 * supported paths. If the peer is a legacy (older kernel revision) peer,
 * it would return a pong message without additional control information
 * that would then alert the sender that the peer was an older rev.
 *
 * RDS_HS_PROBE() is true when exactly one of the two ports is
 * RDS_FLAG_PROBE_PORT and the other is 0.  The arguments are parenthesized
 * so that expression arguments (e.g. "a & b") expand correctly; each
 * argument may be evaluated more than once.
 */
#define RDS_FLAG_PROBE_PORT 1
#define RDS_HS_PROBE(sport, dport) \
        (((sport) == RDS_FLAG_PROBE_PORT && (dport) == 0) || \
         ((sport) == 0 && (dport) == RDS_FLAG_PROBE_PORT))
0201 /*
0202  * Maximum space available for extension headers.
0203  */
0204 #define RDS_HEADER_EXT_SPACE    16
0205 
/* RDS message header.  Multi-byte fields use big-endian (__be*) types.
 * h_csum is filled in by rds_message_make_checksum() and checked by
 * rds_message_verify_checksum(), both of which cover the whole struct.
 */
0206 struct rds_header {
0207     __be64  h_sequence;
0208     __be64  h_ack;
0209     __be32  h_len;
0210     __be16  h_sport;
0211     __be16  h_dport;
0212     u8  h_flags;
0213     u8  h_credit;
0214     u8  h_padding[4];
0215     __sum16 h_csum;
0216 
0217     u8  h_exthdr[RDS_HEADER_EXT_SPACE]; /* extension header area */
0218 };
0219 
0220 /*
0221  * Reserved - indicates end of extensions
0222  */
0223 #define RDS_EXTHDR_NONE     0
0224 
0225 /*
0226  * This extension header is included in the very
0227  * first message that is sent on a new connection,
0228  * and identifies the protocol level. This will help
0229  * rolling updates if a future change requires breaking
0230  * the protocol.
0231  * NB: This is no longer true for IB, where we do a version
0232  * negotiation during the connection setup phase (protocol
0233  * version information is included in the RDMA CM private data).
0234  */
0235 #define RDS_EXTHDR_VERSION  1
0236 struct rds_ext_header_version {
0237     __be32          h_version;
0238 };
0239 
0240 /*
0241  * This extension header is included in the RDS message
0242  * chasing an RDMA operation.
0243  */
0244 #define RDS_EXTHDR_RDMA     2
0245 struct rds_ext_header_rdma {
0246     __be32          h_rdma_rkey;
0247 };
0248 
0249 /*
0250  * This extension header tells the peer about the
0251  * destination <R_Key,offset> of the requested RDMA
0252  * operation.
0253  */
0254 #define RDS_EXTHDR_RDMA_DEST    3
0255 struct rds_ext_header_rdma_dest {
0256     __be32          h_rdma_rkey;
0257     __be32          h_rdma_offset;
0258 };
0259 
/* Extension type 4 is not defined in this header. */
0260 /* Extension header announcing number of paths.
0261  * Implicit length = 2 bytes.
0262  */
0263 #define RDS_EXTHDR_NPATHS   5
/* NOTE(review): presumably carries a generation number (compare
 * c_my_gen_num/c_peer_gen_num in struct rds_connection); its length is not
 * stated here -- confirm.
 */
0264 #define RDS_EXTHDR_GEN_NUM  6
0265 
0266 #define __RDS_EXTHDR_MAX    16 /* for now */
/* Number of entries in rds_incoming.i_rx_lat_trace[]: one more than
 * RDS_MSG_RX_DGRAM_TRACE_MAX, which is not defined in this header.
 */
0267 #define RDS_RX_MAX_TRACES   (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
/* NOTE(review): presumably receive-path trace point identifiers used as
 * indices into i_rx_lat_trace[] -- confirm against the receive code.
 */
0268 #define RDS_MSG_RX_HDR      0
0269 #define RDS_MSG_RX_START    1
0270 #define RDS_MSG_RX_END      2
0271 #define RDS_MSG_RX_CMSG     3
0272 
0273 /* The following values are whitelisted for usercopy */
/* Embedded in struct rds_incoming as i_usercopy. */
0274 struct rds_inc_usercopy {
0275     rds_rdma_cookie_t   rdma_cookie;
0276     ktime_t         rx_tstamp;
0277 };
0278 
/* An incoming message: reference count, list linkage, the connection and
 * path it is associated with, a copy of the RDS header, and the source
 * address.
 */
0279 struct rds_incoming {
0280     refcount_t      i_refcount;
0281     struct list_head    i_item;
0282     struct rds_connection   *i_conn;
0283     struct rds_conn_path    *i_conn_path;
0284     struct rds_header   i_hdr;
0285     unsigned long       i_rx_jiffies;
0286     struct in6_addr     i_saddr;
0287 
0288     struct rds_inc_usercopy i_usercopy;
0289     u64         i_rx_lat_trace[RDS_RX_MAX_TRACES]; /* one slot per trace point */
0290 };
0291 
/* A memory region: kref-counted, identified by r_key, with a back pointer
 * to the owning socket and transport-private state.
 */
0292 struct rds_mr {
0293     struct rb_node      r_rb_node;
0294     struct kref     r_kref;
0295     u32         r_key;
0296 
0297     /* A copy of the creation flags */
0298     unsigned int        r_use_once:1;
0299     unsigned int        r_invalidate:1;
0300     unsigned int        r_write:1;
0301 
0302     struct rds_sock     *r_sock; /* back pointer to the socket that owns us */
0303     struct rds_transport    *r_trans;
0304     void            *r_trans_private; /* opaque to this header */
0305 };
0306 
0307 static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
0308 {
0309     return r_key | (((u64) offset) << 32);
0310 }
0311 
0312 static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
0313 {
0314     return cookie;
0315 }
0316 
0317 static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
0318 {
0319     return cookie >> 32;
0320 }
0321 
0322 /* atomic operation types */
/* NOTE(review): presumably the values of rm_atomic_op.op_type, with CSWP
 * and FADD matching its op_m_cswp / op_m_fadd operand structs -- confirm.
 */
0323 #define RDS_ATOMIC_TYPE_CSWP        0
0324 #define RDS_ATOMIC_TYPE_FADD        1
0325 
0326 /*
0327  * m_sock_item and m_conn_item are on lists that are serialized under
0328  * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
0329  * the message will not be put back on the retransmit list after being sent.
0330  * messages that are canceled while being sent rely on this.
0331  *
0332  * m_inc is used by loopback so that it can pass an incoming message straight
0333  * back up into the rx path.  It embeds a wire header which is also used by
0334  * the send path, which is kind of awkward.
0335  *
0336  * m_sock_item indicates the message's presence on a socket's send or receive
0337  * queue.  m_rs will point to that socket.
0338  *
0339  * m_daddr is used by cancellation to prune messages to a given destination.
0340  *
0341  * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
0342  * nesting.  As paths iterate over messages on a sock, or conn, they must
0343  * also lock the conn, or sock, to remove the message from those lists too.
0344  * Testing the flag to determine if the message is still on the lists lets
0345  * us avoid testing the list_head directly.  That means each path can use
0346  * the message's list_head to keep it on a local list while juggling locks
0347  * without confusing the other path.
0348  *
0349  * m_ack_seq is an optional field set by transports who need a different
0350  * sequence number range to invalidate.  They can use this in a callback
0351  * that they pass to rds_send_drop_acked() to see if each message has been
0352  * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
0353  * had ack_seq set yet.
 *
 * NOTE(review): the values below are consecutive small integers, so they
 * look like bit numbers for rds_message.m_flags rather than masks --
 * confirm against users.
0354  */
0355 #define RDS_MSG_ON_SOCK     1
0356 #define RDS_MSG_ON_CONN     2
0357 #define RDS_MSG_HAS_ACK_SEQ 3
0358 #define RDS_MSG_ACK_REQUIRED    4
0359 #define RDS_MSG_RETRANSMITTED   5
0360 #define RDS_MSG_MAPPED      6
0361 #define RDS_MSG_PAGEVEC     7
0362 #define RDS_MSG_FLUSH       8
0363 
/* Zerocopy notifier: an mmpin plus a 32-bit cookie.  Referenced from
 * rm_data_op.op_mmp_znotifier and embedded in struct rds_msg_zcopy_info.
 */
0364 struct rds_znotifier {
0365     struct mmpin        z_mmp;
0366     u32         z_cookie;
0367 };
0368 
/* List element for zerocopy cookies.  The anonymous union means znotif and
 * zcookies share storage, so only one of them is meaningful at a time.
 */
0369 struct rds_msg_zcopy_info {
0370     struct list_head rs_zcookie_next;
0371     union {
0372         struct rds_znotifier znotif;
0373         struct rds_zcopy_cookies zcookies;
0374     };
0375 };
0376 
/* Lock-protected list head for zerocopy cookies; initialise it with
 * rds_message_zcopy_queue_init().
 */
0377 struct rds_msg_zcopy_queue {
0378     struct list_head zcookie_head;
0379     spinlock_t lock; /* protects zcookie_head queue */
0380 };
0381 
0382 static inline void rds_message_zcopy_queue_init(struct rds_msg_zcopy_queue *q)
0383 {
0384     spin_lock_init(&q->lock);
0385     INIT_LIST_HEAD(&q->zcookie_head);
0386 }
0387 
/* A pointer to rds_iovec entries together with a length.
 * NOTE(review): len is presumably the number of entries in iov -- confirm.
 */
0388 struct rds_iov_vector {
0389     struct rds_iovec *iov;
0390     int               len;
0391 };
0392 
/* An array of rds_iov_vector with bookkeeping integers.
 * NOTE(review): presumably len is the allocated count, indx the next slot
 * and incr the growth step -- confirm against the code that fills it.
 */
0393 struct rds_iov_vector_arr {
0394     struct rds_iov_vector *vec;
0395     int                    len;
0396     int                    indx;
0397     int                    incr;
0398 };
0399 
/* An RDS message.  It embeds an rds_incoming (m_inc) and holds three
 * operation descriptors -- atomic, rdma and data -- inside an anonymous
 * struct, so all three coexist and each has its own op_active bit.
 */
0400 struct rds_message {
0401     refcount_t      m_refcount;
0402     struct list_head    m_sock_item;
0403     struct list_head    m_conn_item;
0404     struct rds_incoming m_inc;
0405     u64         m_ack_seq;
0406     struct in6_addr     m_daddr;
0407     unsigned long       m_flags; /* RDS_MSG_* */
0408 
0409     /* Never access m_rs without holding m_rs_lock.
0410      * Lock nesting is
0411      *  rm->m_rs_lock
0412      *   -> rs->rs_lock
0413      */
0414     spinlock_t      m_rs_lock;
0415     wait_queue_head_t   m_flush_wait;
0416 
0417     struct rds_sock     *m_rs;
0418 
0419     /* cookie to send to remote, in rds header */
0420     rds_rdma_cookie_t   m_rdma_cookie;
0421 
0422     unsigned int        m_used_sgs;
0423     unsigned int        m_total_sgs;
0424 
0425     void            *m_final_op;
0426 
0427     struct {
        /* Atomic operation; op_type selects which operand struct
         * of the union applies (see RDS_ATOMIC_TYPE_*).
         */
0428         struct rm_atomic_op {
0429             int         op_type;
0430             union {
0431                 struct {
0432                     uint64_t    compare;
0433                     uint64_t    swap;
0434                     uint64_t    compare_mask;
0435                     uint64_t    swap_mask;
0436                 } op_m_cswp;
0437                 struct {
0438                     uint64_t    add;
0439                     uint64_t    nocarry_mask;
0440                 } op_m_fadd;
0441             };
0442 
0443             u32         op_rkey;
0444             u64         op_remote_addr;
0445             unsigned int        op_notify:1;
0446             unsigned int        op_recverr:1;
0447             unsigned int        op_mapped:1;
0448             unsigned int        op_silent:1;
0449             unsigned int        op_active:1;
0450             struct scatterlist  *op_sg;
0451             struct rds_notifier *op_notifier;
0452 
0453             struct rds_mr       *op_rdma_mr;
0454         } atomic;
        /* RDMA operation. */
0455         struct rm_rdma_op {
0456             u32         op_rkey;
0457             u64         op_remote_addr;
0458             unsigned int        op_write:1;
0459             unsigned int        op_fence:1;
0460             unsigned int        op_notify:1;
0461             unsigned int        op_recverr:1;
0462             unsigned int        op_mapped:1;
0463             unsigned int        op_silent:1;
0464             unsigned int        op_active:1;
0465             unsigned int        op_bytes;
0466             unsigned int        op_nents;
0467             unsigned int        op_count;
0468             struct scatterlist  *op_sg;
0469             struct rds_notifier *op_notifier;
0470 
0471             struct rds_mr       *op_rdma_mr;
0472 
0473             u64         op_odp_addr;
0474             struct rds_mr       *op_odp_mr;
0475         } rdma;
        /* Data payload operation. */
0476         struct rm_data_op {
0477             unsigned int        op_active:1;
0478             unsigned int        op_nents;
0479             unsigned int        op_count;
0480             unsigned int        op_dmasg;
0481             unsigned int        op_dmaoff;
0482             struct rds_znotifier    *op_mmp_znotifier;
0483             struct scatterlist  *op_sg;
0484         } data;
0485     };
0486 
0487     struct rds_conn_path *m_conn_path;
0488 };
0489 
0490 /*
0491  * The RDS notifier is used (optionally) to tell the application about
0492  * completed RDMA operations. Rather than keeping the whole rds message
0493  * around on the queue, we allocate a small notifier that is put on the
0494  * socket's notifier_list. Notifications are delivered to the application
0495  * through control messages.
 *
 * Both rm_atomic_op and rm_rdma_op carry an op_notifier pointer to one
 * of these.
0496  */
0497 struct rds_notifier {
0498     struct list_head    n_list;
0499     uint64_t        n_user_token;
0500     int         n_status;
0501 };
0502 
0503 /* Available as part of RDS core, so doesn't need to participate
0504  * in get_preferred transport etc
0505  */
0506 #define RDS_TRANS_LOOP  3
0507 
0508 /**
0509  * struct rds_transport -  transport specific behavioural hooks
0510  *
0511  * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
0512  *        part of a message.  The caller serializes on the send_sem so this
0513  *        doesn't need to be reentrant for a given conn.  The header must be
0514  *        sent before the data payload.  .xmit must be prepared to send a
0515  *        message with no data payload.  .xmit should return the number of
0516  *        bytes that were sent down the connection, including header bytes.
0517  *        Returning 0 tells the caller that it doesn't need to perform any
0518  *        additional work now.  This is usually the case when the transport has
0519  *        filled the sending queue for its connection and will handle
0520  *        triggering the rds thread to continue the send when space becomes
0521  *        available.  Returning -EAGAIN tells the caller to retry the send
0522  *        immediately.  Returning -ENOMEM tells the caller to retry the send at
0523  *        some point in the future.
0524  *
0525  * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
0526  *                 it returns the connection can not call rds_recv_incoming().
0527  *                 This will only be called once after conn_connect returns
0528  *                 non-zero success.  The caller serializes this with
0529  *                 the send and connecting paths (xmit_* and conn_*).  The
0530  *                 transport is responsible for other serialization, including
0531  *                 rds_recv_incoming().  This is called in process context but
0532  *                 should try hard not to block.
 *
 * NOTE(review): the members below are named conn_path_shutdown and
 * conn_path_connect; the conn_shutdown/conn_connect names above appear to
 * be the older, pre-path spellings -- confirm and update.
0533  */
0534 
0535 struct rds_transport {
0536     char            t_name[TRANSNAMSIZ];
0537     struct list_head    t_item;
0538     struct module       *t_owner;
0539     unsigned int        t_prefer_loopback:1,
0540                 t_mp_capable:1; /* checked by the rds_conn_*() helpers' WARN_ON */
0541     unsigned int        t_type;
0542 
0543     int (*laddr_check)(struct net *net, const struct in6_addr *addr,
0544                __u32 scope_id);
0545     int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
0546     void (*conn_free)(void *data);
0547     int (*conn_path_connect)(struct rds_conn_path *cp);
0548     void (*conn_path_shutdown)(struct rds_conn_path *conn);
0549     void (*xmit_path_prepare)(struct rds_conn_path *cp);
0550     void (*xmit_path_complete)(struct rds_conn_path *cp);
0551     int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
0552             unsigned int hdr_off, unsigned int sg, unsigned int off);
0553     int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
0554     int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op);
0555     int (*recv_path)(struct rds_conn_path *cp);
0556     int (*inc_copy_to_user)(struct rds_incoming *inc, struct iov_iter *to);
0557     void (*inc_free)(struct rds_incoming *inc);
0558 
    /* RDMA connection-manager event hooks. */
0559     int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
0560                  struct rdma_cm_event *event, bool isv6);
0561     int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
0562     void (*cm_connect_complete)(struct rds_connection *conn,
0563                     struct rdma_cm_event *event);
0564 
0565     unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
0566                     unsigned int avail);
0567     void (*exit)(void);
    /* Memory-region hooks; the void * values are transport private. */
0568     void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
0569             struct rds_sock *rs, u32 *key_ret,
0570             struct rds_connection *conn,
0571             u64 start, u64 length, int need_odp);
0572     void (*sync_mr)(void *trans_private, int direction);
0573     void (*free_mr)(void *trans_private, int invalidate);
0574     void (*flush_mrs)(void);
0575     bool (*t_unloading)(struct rds_connection *conn);
0576     u8 (*get_tos_map)(u8 tos);
0577 };
0578 
0579 /* Bind hash table key length.  It is the sum of the size of a struct
0580  * in6_addr, a scope_id  and a port.
0581  */
0582 #define RDS_BOUND_KEY_LEN \
0583     (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
0584 
/* RDS socket.  rs_sk is the first member; rds_sk_to_rs() recovers the
 * rds_sock from a struct sock with container_of().
 */
0585 struct rds_sock {
0586     struct sock     rs_sk;
0587 
0588     u64         rs_user_addr;
0589     u64         rs_user_bytes;
0590 
0591     /*
0592      * bound_addr used for both incoming and outgoing, no INADDR_ANY
0593      * support.
0594      */
0595     struct rhash_head   rs_bound_node;
0596     u8          rs_bound_key[RDS_BOUND_KEY_LEN];
0597     struct sockaddr_in6 rs_bound_sin6;
    /* Shorthand accessors into rs_bound_sin6; the _v4 variants select
     * the last 32-bit word of the IPv6 address.
     */
0598 #define rs_bound_addr       rs_bound_sin6.sin6_addr
0599 #define rs_bound_addr_v4    rs_bound_sin6.sin6_addr.s6_addr32[3]
0600 #define rs_bound_port       rs_bound_sin6.sin6_port
0601 #define rs_bound_scope_id   rs_bound_sin6.sin6_scope_id
0602     struct in6_addr     rs_conn_addr;
0603 #define rs_conn_addr_v4     rs_conn_addr.s6_addr32[3]
0604     __be16          rs_conn_port;
0605     struct rds_transport    *rs_transport;
0606 
0607     /*
0608      * rds_sendmsg caches the conn it used the last time around.
0609      * This helps avoid costly lookups.
0610      */
0611     struct rds_connection   *rs_conn;
0612 
0613     /* flag indicating we were congested or not */
0614     int         rs_congested;
0615     /* seen congestion (ENOBUFS) when sending? */
0616     int         rs_seen_congestion;
0617 
0618     /* rs_lock protects all these adjacent members before the newline */
0619     spinlock_t      rs_lock;
0620     struct list_head    rs_send_queue;
0621     u32         rs_snd_bytes;
0622     int         rs_rcv_bytes;
0623     struct list_head    rs_notify_queue;    /* currently used for failed RDMAs */
0624 
0625     /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
0626      * to decide whether the application should be woken up.
0627      * If not set, we use rs_cong_track to find out whether a cong map
0628      * update arrived.
0629      */
0630     uint64_t        rs_cong_mask;
0631     uint64_t        rs_cong_notify;
0632     struct list_head    rs_cong_list;
0633     unsigned long       rs_cong_track;
0634 
0635     /*
0636      * rs_recv_lock protects the receive queue, and is
0637      * used to serialize with rds_release.
0638      */
0639     rwlock_t        rs_recv_lock;
0640     struct list_head    rs_recv_queue;
0641 
0642     /* just for stats reporting */
0643     struct list_head    rs_item;
0644 
0645     /* these have their own lock */
0646     spinlock_t      rs_rdma_lock;
0647     struct rb_root      rs_rdma_keys;
0648 
0649     /* Socket options - in case there will be more */
0650     unsigned char       rs_recverr,
0651                 rs_cong_monitor;
0652     u32         rs_hash_initval; /* seed used by RDS_MPATH_HASH() */
0653 
0654     /* Socket receive path trace points */
0655     u8          rs_rx_traces;
0656     u8          rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX];
0657     struct rds_msg_zcopy_queue rs_zcookie_queue;
0658     u8          rs_tos;
0659 };
0660 
0661 static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
0662 {
0663     return container_of(sk, struct rds_sock, rs_sk);
0664 }
0665 static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
0666 {
0667     return &rs->rs_sk;
0668 }
0669 
0670 /*
0671  * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
0672  * to account for overhead.  We don't account for overhead, we just apply
0673  * the number of payload bytes to the specified value.
0674  */
0675 static inline int rds_sk_sndbuf(struct rds_sock *rs)
0676 {
0677     return rds_rs_to_sk(rs)->sk_sndbuf / 2;
0678 }
0679 static inline int rds_sk_rcvbuf(struct rds_sock *rs)
0680 {
0681     return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
0682 }
0683 
/* RDS statistics counters, all 64-bit.  The name prefixes group them by
 * area: connection, receive, send, page remainder, user copy, congestion.
 */
0684 struct rds_statistics {
0685     uint64_t    s_conn_reset;
0686     uint64_t    s_recv_drop_bad_checksum;
0687     uint64_t    s_recv_drop_old_seq;
0688     uint64_t    s_recv_drop_no_sock;
0689     uint64_t    s_recv_drop_dead_sock;
0690     uint64_t    s_recv_deliver_raced;
0691     uint64_t    s_recv_delivered;
0692     uint64_t    s_recv_queued;
0693     uint64_t    s_recv_immediate_retry;
0694     uint64_t    s_recv_delayed_retry;
0695     uint64_t    s_recv_ack_required;
0696     uint64_t    s_recv_rdma_bytes;
0697     uint64_t    s_recv_ping;
0698     uint64_t    s_send_queue_empty;
0699     uint64_t    s_send_queue_full;
0700     uint64_t    s_send_lock_contention;
0701     uint64_t    s_send_lock_queue_raced;
0702     uint64_t    s_send_immediate_retry;
0703     uint64_t    s_send_delayed_retry;
0704     uint64_t    s_send_drop_acked;
0705     uint64_t    s_send_ack_required;
0706     uint64_t    s_send_queued;
0707     uint64_t    s_send_rdma;
0708     uint64_t    s_send_rdma_bytes;
0709     uint64_t    s_send_pong;
0710     uint64_t    s_page_remainder_hit;
0711     uint64_t    s_page_remainder_miss;
0712     uint64_t    s_copy_to_user;
0713     uint64_t    s_copy_from_user;
0714     uint64_t    s_cong_update_queued;
0715     uint64_t    s_cong_update_received;
0716     uint64_t    s_cong_send_error;
0717     uint64_t    s_cong_send_blocked;
0718     uint64_t    s_recv_bytes_added_to_socket;
0719     uint64_t    s_recv_bytes_removed_from_socket;
0720     uint64_t    s_send_stuck_rm;
0721 };
0722 
0723 /* af_rds.c */
/* NOTE(review): rds_sock_addref()/rds_sock_put() are presumably a reference
 * get/put pair on the socket -- confirm in af_rds.c.
 */
0724 void rds_sock_addref(struct rds_sock *rs);
0725 void rds_sock_put(struct rds_sock *rs);
0726 void rds_wake_sk_sleep(struct rds_sock *rs);
0727 static inline void __rds_wake_sk_sleep(struct sock *sk)
0728 {
0729     wait_queue_head_t *waitq = sk_sleep(sk);
0730 
0731     if (!sock_flag(sk, SOCK_DEAD) && waitq)
0732         wake_up(waitq);
0733 }
0734 extern wait_queue_head_t rds_poll_waitq;
0735 
0736 
0737 /* bind.c */
/* rds_find_bound() looks a socket up by (addr, port, scope_id), the same
 * three components that RDS_BOUND_KEY_LEN sums.
 */
0738 int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
0739 void rds_remove_bound(struct rds_sock *rs);
0740 struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
0741                 __u32 scope_id);
0742 int rds_bind_lock_init(void);
0743 void rds_bind_lock_destroy(void);
0744 
0745 /* cong.c */
/* Port arguments are __be16, i.e. big-endian typed values. */
0746 int rds_cong_get_maps(struct rds_connection *conn);
0747 void rds_cong_add_conn(struct rds_connection *conn);
0748 void rds_cong_remove_conn(struct rds_connection *conn);
0749 void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
0750 void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
0751 int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
0752 void rds_cong_queue_updates(struct rds_cong_map *map);
/* NOTE(review): the second parameter is unnamed here; take its name and
 * meaning from the definition in cong.c.
 */
0753 void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
0754 int rds_cong_updated_since(unsigned long *recent);
0755 void rds_cong_add_socket(struct rds_sock *);
0756 void rds_cong_remove_socket(struct rds_sock *);
0757 void rds_cong_exit(void);
0758 struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
0759 
0760 /* connection.c */
0761 extern u32 rds_gen_num;
0762 int rds_conn_init(void);
0763 void rds_conn_exit(void);
/* rds_conn_create() and rds_conn_create_outgoing() take identical
 * parameters.
 */
0764 struct rds_connection *rds_conn_create(struct net *net,
0765                        const struct in6_addr *laddr,
0766                        const struct in6_addr *faddr,
0767                        struct rds_transport *trans,
0768                        u8 tos, gfp_t gfp,
0769                        int dev_if);
0770 struct rds_connection *rds_conn_create_outgoing(struct net *net,
0771                         const struct in6_addr *laddr,
0772                         const struct in6_addr *faddr,
0773                         struct rds_transport *trans,
0774                         u8 tos, gfp_t gfp, int dev_if);
0775 void rds_conn_shutdown(struct rds_conn_path *cpath);
0776 void rds_conn_destroy(struct rds_connection *conn);
0777 void rds_conn_drop(struct rds_connection *conn);
0778 void rds_conn_path_drop(struct rds_conn_path *cpath, bool destroy);
0779 void rds_conn_connect_if_down(struct rds_connection *conn);
0780 void rds_conn_path_connect_if_down(struct rds_conn_path *cp);
0781 void rds_check_all_paths(struct rds_connection *conn);
0782 void rds_for_each_conn_info(struct socket *sock, unsigned int len,
0783               struct rds_info_iterator *iter,
0784               struct rds_info_lengths *lens,
0785               int (*visitor)(struct rds_connection *, void *),
0786               u64 *buffer,
0787               size_t item_len);
0788 
/* printf-style error reporting for a connection path.  The wrapper macro
 * prepends KERN_WARNING "RDS: " by string-literal concatenation, so the
 * format passed to rds_conn_path_error() must be a string literal.
 */
0789 __printf(2, 3)
0790 void __rds_conn_path_error(struct rds_conn_path *cp, const char *, ...);
0791 #define rds_conn_path_error(cp, fmt...) \
0792     __rds_conn_path_error(cp, KERN_WARNING "RDS: " fmt)
0793 
0794 static inline int
0795 rds_conn_path_transition(struct rds_conn_path *cp, int old, int new)
0796 {
0797     return atomic_cmpxchg(&cp->cp_state, old, new) == old;
0798 }
0799 
0800 static inline int
0801 rds_conn_transition(struct rds_connection *conn, int old, int new)
0802 {
0803     WARN_ON(conn->c_trans->t_mp_capable);
0804     return rds_conn_path_transition(&conn->c_path[0], old, new);
0805 }
0806 
0807 static inline int
0808 rds_conn_path_state(struct rds_conn_path *cp)
0809 {
0810     return atomic_read(&cp->cp_state);
0811 }
0812 
0813 static inline int
0814 rds_conn_state(struct rds_connection *conn)
0815 {
0816     WARN_ON(conn->c_trans->t_mp_capable);
0817     return rds_conn_path_state(&conn->c_path[0]);
0818 }
0819 
0820 static inline int
0821 rds_conn_path_up(struct rds_conn_path *cp)
0822 {
0823     return atomic_read(&cp->cp_state) == RDS_CONN_UP;
0824 }
0825 
0826 static inline int
0827 rds_conn_path_down(struct rds_conn_path *cp)
0828 {
0829     return atomic_read(&cp->cp_state) == RDS_CONN_DOWN;
0830 }
0831 
0832 static inline int
0833 rds_conn_up(struct rds_connection *conn)
0834 {
0835     WARN_ON(conn->c_trans->t_mp_capable);
0836     return rds_conn_path_up(&conn->c_path[0]);
0837 }
0838 
0839 static inline int
0840 rds_conn_path_connecting(struct rds_conn_path *cp)
0841 {
0842     return atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING;
0843 }
0844 
0845 static inline int
0846 rds_conn_connecting(struct rds_connection *conn)
0847 {
0848     WARN_ON(conn->c_trans->t_mp_capable);
0849     return rds_conn_path_connecting(&conn->c_path[0]);
0850 }
0851 
0852 /* message.c */
0853 struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
0854 struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents);
0855 int rds_message_copy_from_user(struct rds_message *rm, struct iov_iter *from,
0856                    bool zcopy);
0857 struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
0858 void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
0859                  __be16 dport, u64 seq);
/* Extension header helpers; type values are the RDS_EXTHDR_* constants. */
0860 int rds_message_add_extension(struct rds_header *hdr,
0861                   unsigned int type, const void *data, unsigned int len);
0862 int rds_message_next_extension(struct rds_header *hdr,
0863                    unsigned int *pos, void *buf, unsigned int *buflen);
0864 int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
/* These two have the same signatures as the inc_copy_to_user and inc_free
 * hooks in struct rds_transport.
 */
0865 int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
0866 void rds_message_inc_free(struct rds_incoming *inc);
0867 void rds_message_addref(struct rds_message *rm);
0868 void rds_message_put(struct rds_message *rm);
0869 void rds_message_wait(struct rds_message *rm);
0870 void rds_message_unmapped(struct rds_message *rm);
0871 void rds_notify_msg_zcopy_purge(struct rds_msg_zcopy_queue *info);
0872 
0873 static inline void rds_message_make_checksum(struct rds_header *hdr)
0874 {
0875     hdr->h_csum = 0;
0876     hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
0877 }
0878 
0879 static inline int rds_message_verify_checksum(const struct rds_header *hdr)
0880 {
0881     return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
0882 }
0883 
0884 
0885 /* page.c */
/*
 * NOTE(review): by its name and signature this presumably points @scat at
 * @bytes of page memory allocated with @gfp -- confirm in page.c.
 */
0886 int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
0887                  gfp_t gfp);
/* Teardown hook for page.c; what it releases is not visible from this header. */
0888 void rds_page_exit(void);
0889 
0890 /* recv.c */
0891 void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
0892           struct in6_addr *saddr);
0893 void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
0894                struct in6_addr *saddr);
0895 void rds_inc_put(struct rds_incoming *inc);
0896 void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
0897                struct in6_addr *daddr,
0898                struct rds_incoming *inc, gfp_t gfp);
0899 int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
0900         int msg_flags);
0901 void rds_clear_recv_queue(struct rds_sock *rs);
0902 int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
0903 void rds_inc_info_copy(struct rds_incoming *inc,
0904                struct rds_info_iterator *iter,
0905                __be32 saddr, __be32 daddr, int flip);
0906 void rds6_inc_info_copy(struct rds_incoming *inc,
0907             struct rds_info_iterator *iter,
0908             struct in6_addr *saddr, struct in6_addr *daddr,
0909             int flip);
0910 
0911 /* send.c */
0912 int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
0913 void rds_send_path_reset(struct rds_conn_path *conn);
0914 int rds_send_xmit(struct rds_conn_path *cp);
0915 struct sockaddr_in;
0916 void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
0917 typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
0918 void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
0919              is_acked_func is_acked);
0920 void rds_send_path_drop_acked(struct rds_conn_path *cp, u64 ack,
0921                   is_acked_func is_acked);
0922 void rds_send_ping(struct rds_connection *conn, int cp_index);
0923 int rds_send_pong(struct rds_conn_path *cp, __be16 dport);
0924 
0925 /* rdma.c */
/*
 * Prototypes for the RDMA memory-region, cmsg and completion helpers.
 * NOTE(review): groupings are inferred from the function names; see rdma.c
 * for the actual contracts.
 */
/* Memory-region (MR) management; the sockptr_t/optlen pairs are socket-option style arguments. */
0926 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
0927 int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
0928 int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen);
0929 int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen);
0930 void rds_rdma_drop_keys(struct rds_sock *rs);
/* Control-message (cmsg) handling for RDMA operations. */
0931 int rds_rdma_extra_size(struct rds_rdma_args *args,
0932             struct rds_iov_vector *iov);
0933 int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
0934               struct cmsghdr *cmsg);
0935 int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
0936               struct cmsghdr *cmsg,
0937               struct rds_iov_vector *vec);
0938 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
0939               struct cmsghdr *cmsg);
/* Operation teardown and send-completion notification. */
0940 void rds_rdma_free_op(struct rm_rdma_op *ro);
0941 void rds_atomic_free_op(struct rm_atomic_op *ao);
0942 void rds_rdma_send_complete(struct rds_message *rm, int wc_status);
0943 void rds_atomic_send_complete(struct rds_message *rm, int wc_status);
0944 int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
0945             struct cmsghdr *cmsg);
0946 
/* Takes a struct kref *, i.e. it has the shape of a kref release callback. */
0947 void __rds_put_mr_final(struct kref *kref);
0948 
0949 static inline bool rds_destroy_pending(struct rds_connection *conn)
0950 {
0951     return !check_net(rds_conn_net(conn)) ||
0952            (conn->c_trans->t_unloading && conn->c_trans->t_unloading(conn));
0953 }
0954 
/*
 * Anonymous enum with implicit values 0, 1, 2.
 * NOTE(review): the ODP_ prefix presumably stands for on-demand paging and
 * the values select how an MR is addressed -- confirm against the users in
 * rdma.c.
 */
0955 enum {
0956     ODP_NOT_NEEDED,
0957     ODP_ZEROBASED,
0958     ODP_VIRTUAL
0959 };
0960 
0961 /* stats.c */
0962 DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
0963 #define rds_stats_inc_which(which, member) do {     \
0964     per_cpu(which, get_cpu()).member++;     \
0965     put_cpu();                  \
0966 } while (0)
0967 #define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
0968 #define rds_stats_add_which(which, member, count) do {      \
0969     per_cpu(which, get_cpu()).member += count;  \
0970     put_cpu();                  \
0971 } while (0)
0972 #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
0973 int rds_stats_init(void);
0974 void rds_stats_exit(void);
0975 void rds_stats_info_copy(struct rds_info_iterator *iter,
0976              uint64_t *values, const char *const *names,
0977              size_t nr);
0978 
0979 /* sysctl.c */
0980 int rds_sysctl_init(void);
0981 void rds_sysctl_exit(void);
/*
 * Tunables defined in sysctl.c.  NOTE(review): units are only implied by the
 * names (the reconnect limits end in _jiffies); the sndbuf values are
 * presumably byte counts -- confirm in sysctl.c.
 */
0982 extern unsigned long rds_sysctl_sndbuf_min;
0983 extern unsigned long rds_sysctl_sndbuf_default;
0984 extern unsigned long rds_sysctl_sndbuf_max;
0985 extern unsigned long rds_sysctl_reconnect_min_jiffies;
0986 extern unsigned long rds_sysctl_reconnect_max_jiffies;
0987 extern unsigned int  rds_sysctl_max_unacked_packets;
0988 extern unsigned int  rds_sysctl_max_unacked_bytes;
0989 extern unsigned int  rds_sysctl_ping_enable;
0990 extern unsigned long rds_sysctl_trace_flags;
0991 extern unsigned int  rds_sysctl_trace_level;
0992 
/*
 * threads.c
 *
 * Workqueue plumbing: the shared rds_wq workqueue and the work handlers
 * (each takes the struct work_struct it was queued with).  Path parameters
 * are named "cp" as elsewhere in this header.
 */
int rds_threads_init(void);
void rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
void rds_queue_reconnect(struct rds_conn_path *cp);
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_path_complete(struct rds_conn_path *cp, int curr);
void rds_connect_complete(struct rds_connection *conn);
int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
1005 
1006 /* transport.c */
/* Registration of a transport with the RDS core. */
1007 void rds_trans_register(struct rds_transport *trans);
1008 void rds_trans_unregister(struct rds_transport *trans);
/*
 * NOTE(review): the get/put naming suggests rds_trans_get_preferred() and
 * rds_trans_get() return a referenced transport that the caller releases
 * with rds_trans_put() -- confirm in transport.c.
 */
1009 struct rds_transport *rds_trans_get_preferred(struct net *net,
1010                           const struct in6_addr *addr,
1011                           __u32 scope_id);
1012 void rds_trans_put(struct rds_transport *trans);
1013 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
1014                        unsigned int avail);
1015 struct rds_transport *rds_trans_get(int t_type);
1016 int rds_trans_init(void);
1017 void rds_trans_exit(void);
1018 
1019 #endif