Back to home page

OSCL-LXR

 
 

    


0001 /*
0002  * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved.
0003  *
0004  * This software is available to you under a choice of one of two
0005  * licenses.  You may choose to be licensed under the terms of the GNU
0006  * General Public License (GPL) Version 2, available from the file
0007  * COPYING in the main directory of this source tree, or the
0008  * OpenIB.org BSD license below:
0009  *
0010  *     Redistribution and use in source and binary forms, with or
0011  *     without modification, are permitted provided that the following
0012  *     conditions are met:
0013  *
0014  *      - Redistributions of source code must retain the above
0015  *        copyright notice, this list of conditions and the following
0016  *        disclaimer.
0017  *
0018  *      - Redistributions in binary form must reproduce the above
0019  *        copyright notice, this list of conditions and the following
0020  *        disclaimer in the documentation and/or other materials
0021  *        provided with the distribution.
0022  *
0023  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
0024  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
0025  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
0026  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
0027  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
0028  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
0029  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
0030  * SOFTWARE.
0031  *
0032  */
0033 #include <linux/module.h>
0034 #include <linux/errno.h>
0035 #include <linux/kernel.h>
0036 #include <linux/gfp.h>
0037 #include <linux/in.h>
0038 #include <linux/ipv6.h>
0039 #include <linux/poll.h>
0040 #include <net/sock.h>
0041 
0042 #include "rds.h"
0043 
/*
 * Global registry of RDS sockets, used by the info handlers in this file
 * to gather statistics.  rds_sock_lock protects both the list and the
 * counter (rds_ioctl() also takes it around rs_tos accesses).
 */
static DEFINE_SPINLOCK(rds_sock_lock);
static LIST_HEAD(rds_sock_list);
static unsigned long rds_sock_count;

/* Additional wait queue rds_poll() waits on for sockets that saw congestion. */
DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
0049 
0050 /*
0051  * This is called as the final descriptor referencing this socket is closed.
0052  * We have to unbind the socket so that another socket can be bound to the
0053  * address it was using.
0054  *
0055  * We have to be careful about racing with the incoming path.  sock_orphan()
0056  * sets SOCK_DEAD and we use that as an indicator to the rx path that new
0057  * messages shouldn't be queued.
0058  */
0059 static int rds_release(struct socket *sock)
0060 {
0061     struct sock *sk = sock->sk;
0062     struct rds_sock *rs;
0063 
0064     if (!sk)
0065         goto out;
0066 
0067     rs = rds_sk_to_rs(sk);
0068 
0069     sock_orphan(sk);
0070     /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
0071      * that ensures the recv path has completed messing
0072      * with the socket. */
0073     rds_clear_recv_queue(rs);
0074     rds_cong_remove_socket(rs);
0075 
0076     rds_remove_bound(rs);
0077 
0078     rds_send_drop_to(rs, NULL);
0079     rds_rdma_drop_keys(rs);
0080     rds_notify_queue_get(rs, NULL);
0081     rds_notify_msg_zcopy_purge(&rs->rs_zcookie_queue);
0082 
0083     spin_lock_bh(&rds_sock_lock);
0084     list_del_init(&rs->rs_item);
0085     rds_sock_count--;
0086     spin_unlock_bh(&rds_sock_lock);
0087 
0088     rds_trans_put(rs->rs_transport);
0089 
0090     sock->sk = NULL;
0091     sock_put(sk);
0092 out:
0093     return 0;
0094 }
0095 
0096 /*
0097  * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
0098  * _bh() isn't OK here, we're called from interrupt handlers.  It's probably OK
0099  * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
0100  * this seems more conservative.
0101  * NB - normally, one would use sk_callback_lock for this, but we can
0102  * get here from interrupts, whereas the network code grabs sk_callback_lock
0103  * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
0104  */
0105 void rds_wake_sk_sleep(struct rds_sock *rs)
0106 {
0107     unsigned long flags;
0108 
0109     read_lock_irqsave(&rs->rs_recv_lock, flags);
0110     __rds_wake_sk_sleep(rds_rs_to_sk(rs));
0111     read_unlock_irqrestore(&rs->rs_recv_lock, flags);
0112 }
0113 
0114 static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
0115                int peer)
0116 {
0117     struct rds_sock *rs = rds_sk_to_rs(sock->sk);
0118     struct sockaddr_in6 *sin6;
0119     struct sockaddr_in *sin;
0120     int uaddr_len;
0121 
0122     /* racey, don't care */
0123     if (peer) {
0124         if (ipv6_addr_any(&rs->rs_conn_addr))
0125             return -ENOTCONN;
0126 
0127         if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
0128             sin = (struct sockaddr_in *)uaddr;
0129             memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
0130             sin->sin_family = AF_INET;
0131             sin->sin_port = rs->rs_conn_port;
0132             sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
0133             uaddr_len = sizeof(*sin);
0134         } else {
0135             sin6 = (struct sockaddr_in6 *)uaddr;
0136             sin6->sin6_family = AF_INET6;
0137             sin6->sin6_port = rs->rs_conn_port;
0138             sin6->sin6_addr = rs->rs_conn_addr;
0139             sin6->sin6_flowinfo = 0;
0140             /* scope_id is the same as in the bound address. */
0141             sin6->sin6_scope_id = rs->rs_bound_scope_id;
0142             uaddr_len = sizeof(*sin6);
0143         }
0144     } else {
0145         /* If socket is not yet bound and the socket is connected,
0146          * set the return address family to be the same as the
0147          * connected address, but with 0 address value.  If it is not
0148          * connected, set the family to be AF_UNSPEC (value 0) and
0149          * the address size to be that of an IPv4 address.
0150          */
0151         if (ipv6_addr_any(&rs->rs_bound_addr)) {
0152             if (ipv6_addr_any(&rs->rs_conn_addr)) {
0153                 sin = (struct sockaddr_in *)uaddr;
0154                 memset(sin, 0, sizeof(*sin));
0155                 sin->sin_family = AF_UNSPEC;
0156                 return sizeof(*sin);
0157             }
0158 
0159 #if IS_ENABLED(CONFIG_IPV6)
0160             if (!(ipv6_addr_type(&rs->rs_conn_addr) &
0161                   IPV6_ADDR_MAPPED)) {
0162                 sin6 = (struct sockaddr_in6 *)uaddr;
0163                 memset(sin6, 0, sizeof(*sin6));
0164                 sin6->sin6_family = AF_INET6;
0165                 return sizeof(*sin6);
0166             }
0167 #endif
0168 
0169             sin = (struct sockaddr_in *)uaddr;
0170             memset(sin, 0, sizeof(*sin));
0171             sin->sin_family = AF_INET;
0172             return sizeof(*sin);
0173         }
0174         if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
0175             sin = (struct sockaddr_in *)uaddr;
0176             memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
0177             sin->sin_family = AF_INET;
0178             sin->sin_port = rs->rs_bound_port;
0179             sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
0180             uaddr_len = sizeof(*sin);
0181         } else {
0182             sin6 = (struct sockaddr_in6 *)uaddr;
0183             sin6->sin6_family = AF_INET6;
0184             sin6->sin6_port = rs->rs_bound_port;
0185             sin6->sin6_addr = rs->rs_bound_addr;
0186             sin6->sin6_flowinfo = 0;
0187             sin6->sin6_scope_id = rs->rs_bound_scope_id;
0188             uaddr_len = sizeof(*sin6);
0189         }
0190     }
0191 
0192     return uaddr_len;
0193 }
0194 
0195 /*
0196  * RDS' poll is without a doubt the least intuitive part of the interface,
0197  * as EPOLLIN and EPOLLOUT do not behave entirely as you would expect from
0198  * a network protocol.
0199  *
0200  * EPOLLIN is asserted if
0201  *  -   there is data on the receive queue.
0202  *  -   to signal that a previously congested destination may have become
0203  *  uncongested
0204  *  -   A notification has been queued to the socket (this can be a congestion
0205  *  update, or a RDMA completion, or a MSG_ZEROCOPY completion).
0206  *
0207  * EPOLLOUT is asserted if there is room on the send queue. This does not mean
0208  * however, that the next sendmsg() call will succeed. If the application tries
0209  * to send to a congested destination, the system call may still fail (and
0210  * return ENOBUFS).
0211  */
0212 static __poll_t rds_poll(struct file *file, struct socket *sock,
0213                  poll_table *wait)
0214 {
0215     struct sock *sk = sock->sk;
0216     struct rds_sock *rs = rds_sk_to_rs(sk);
0217     __poll_t mask = 0;
0218     unsigned long flags;
0219 
0220     poll_wait(file, sk_sleep(sk), wait);
0221 
0222     if (rs->rs_seen_congestion)
0223         poll_wait(file, &rds_poll_waitq, wait);
0224 
0225     read_lock_irqsave(&rs->rs_recv_lock, flags);
0226     if (!rs->rs_cong_monitor) {
0227         /* When a congestion map was updated, we signal EPOLLIN for
0228          * "historical" reasons. Applications can also poll for
0229          * WRBAND instead. */
0230         if (rds_cong_updated_since(&rs->rs_cong_track))
0231             mask |= (EPOLLIN | EPOLLRDNORM | EPOLLWRBAND);
0232     } else {
0233         spin_lock(&rs->rs_lock);
0234         if (rs->rs_cong_notify)
0235             mask |= (EPOLLIN | EPOLLRDNORM);
0236         spin_unlock(&rs->rs_lock);
0237     }
0238     if (!list_empty(&rs->rs_recv_queue) ||
0239         !list_empty(&rs->rs_notify_queue) ||
0240         !list_empty(&rs->rs_zcookie_queue.zcookie_head))
0241         mask |= (EPOLLIN | EPOLLRDNORM);
0242     if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
0243         mask |= (EPOLLOUT | EPOLLWRNORM);
0244     if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
0245         mask |= POLLERR;
0246     read_unlock_irqrestore(&rs->rs_recv_lock, flags);
0247 
0248     /* clear state any time we wake a seen-congested socket */
0249     if (mask)
0250         rs->rs_seen_congestion = 0;
0251 
0252     return mask;
0253 }
0254 
0255 static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
0256 {
0257     struct rds_sock *rs = rds_sk_to_rs(sock->sk);
0258     rds_tos_t utos, tos = 0;
0259 
0260     switch (cmd) {
0261     case SIOCRDSSETTOS:
0262         if (get_user(utos, (rds_tos_t __user *)arg))
0263             return -EFAULT;
0264 
0265         if (rs->rs_transport &&
0266             rs->rs_transport->get_tos_map)
0267             tos = rs->rs_transport->get_tos_map(utos);
0268         else
0269             return -ENOIOCTLCMD;
0270 
0271         spin_lock_bh(&rds_sock_lock);
0272         if (rs->rs_tos || rs->rs_conn) {
0273             spin_unlock_bh(&rds_sock_lock);
0274             return -EINVAL;
0275         }
0276         rs->rs_tos = tos;
0277         spin_unlock_bh(&rds_sock_lock);
0278         break;
0279     case SIOCRDSGETTOS:
0280         spin_lock_bh(&rds_sock_lock);
0281         tos = rs->rs_tos;
0282         spin_unlock_bh(&rds_sock_lock);
0283         if (put_user(tos, (rds_tos_t __user *)arg))
0284             return -EFAULT;
0285         break;
0286     default:
0287         return -ENOIOCTLCMD;
0288     }
0289 
0290     return 0;
0291 }
0292 
0293 static int rds_cancel_sent_to(struct rds_sock *rs, sockptr_t optval, int len)
0294 {
0295     struct sockaddr_in6 sin6;
0296     struct sockaddr_in sin;
0297     int ret = 0;
0298 
0299     /* racing with another thread binding seems ok here */
0300     if (ipv6_addr_any(&rs->rs_bound_addr)) {
0301         ret = -ENOTCONN; /* XXX not a great errno */
0302         goto out;
0303     }
0304 
0305     if (len < sizeof(struct sockaddr_in)) {
0306         ret = -EINVAL;
0307         goto out;
0308     } else if (len < sizeof(struct sockaddr_in6)) {
0309         /* Assume IPv4 */
0310         if (copy_from_sockptr(&sin, optval,
0311                 sizeof(struct sockaddr_in))) {
0312             ret = -EFAULT;
0313             goto out;
0314         }
0315         ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
0316         sin6.sin6_port = sin.sin_port;
0317     } else {
0318         if (copy_from_sockptr(&sin6, optval,
0319                    sizeof(struct sockaddr_in6))) {
0320             ret = -EFAULT;
0321             goto out;
0322         }
0323     }
0324 
0325     rds_send_drop_to(rs, &sin6);
0326 out:
0327     return ret;
0328 }
0329 
0330 static int rds_set_bool_option(unsigned char *optvar, sockptr_t optval,
0331                    int optlen)
0332 {
0333     int value;
0334 
0335     if (optlen < sizeof(int))
0336         return -EINVAL;
0337     if (copy_from_sockptr(&value, optval, sizeof(int)))
0338         return -EFAULT;
0339     *optvar = !!value;
0340     return 0;
0341 }
0342 
0343 static int rds_cong_monitor(struct rds_sock *rs, sockptr_t optval, int optlen)
0344 {
0345     int ret;
0346 
0347     ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
0348     if (ret == 0) {
0349         if (rs->rs_cong_monitor) {
0350             rds_cong_add_socket(rs);
0351         } else {
0352             rds_cong_remove_socket(rs);
0353             rs->rs_cong_mask = 0;
0354             rs->rs_cong_notify = 0;
0355         }
0356     }
0357     return ret;
0358 }
0359 
0360 static int rds_set_transport(struct rds_sock *rs, sockptr_t optval, int optlen)
0361 {
0362     int t_type;
0363 
0364     if (rs->rs_transport)
0365         return -EOPNOTSUPP; /* previously attached to transport */
0366 
0367     if (optlen != sizeof(int))
0368         return -EINVAL;
0369 
0370     if (copy_from_sockptr(&t_type, optval, sizeof(t_type)))
0371         return -EFAULT;
0372 
0373     if (t_type < 0 || t_type >= RDS_TRANS_COUNT)
0374         return -EINVAL;
0375 
0376     rs->rs_transport = rds_trans_get(t_type);
0377 
0378     return rs->rs_transport ? 0 : -ENOPROTOOPT;
0379 }
0380 
0381 static int rds_enable_recvtstamp(struct sock *sk, sockptr_t optval,
0382                  int optlen, int optname)
0383 {
0384     int val, valbool;
0385 
0386     if (optlen != sizeof(int))
0387         return -EFAULT;
0388 
0389     if (copy_from_sockptr(&val, optval, sizeof(int)))
0390         return -EFAULT;
0391 
0392     valbool = val ? 1 : 0;
0393 
0394     if (optname == SO_TIMESTAMP_NEW)
0395         sock_set_flag(sk, SOCK_TSTAMP_NEW);
0396 
0397     if (valbool)
0398         sock_set_flag(sk, SOCK_RCVTSTAMP);
0399     else
0400         sock_reset_flag(sk, SOCK_RCVTSTAMP);
0401 
0402     return 0;
0403 }
0404 
0405 static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
0406                   int optlen)
0407 {
0408     struct rds_rx_trace_so trace;
0409     int i;
0410 
0411     if (optlen != sizeof(struct rds_rx_trace_so))
0412         return -EFAULT;
0413 
0414     if (copy_from_sockptr(&trace, optval, sizeof(trace)))
0415         return -EFAULT;
0416 
0417     if (trace.rx_traces > RDS_MSG_RX_DGRAM_TRACE_MAX)
0418         return -EFAULT;
0419 
0420     rs->rs_rx_traces = trace.rx_traces;
0421     for (i = 0; i < rs->rs_rx_traces; i++) {
0422         if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
0423             rs->rs_rx_traces = 0;
0424             return -EFAULT;
0425         }
0426         rs->rs_rx_trace[i] = trace.rx_trace_pos[i];
0427     }
0428 
0429     return 0;
0430 }
0431 
0432 static int rds_setsockopt(struct socket *sock, int level, int optname,
0433               sockptr_t optval, unsigned int optlen)
0434 {
0435     struct rds_sock *rs = rds_sk_to_rs(sock->sk);
0436     int ret;
0437 
0438     if (level != SOL_RDS) {
0439         ret = -ENOPROTOOPT;
0440         goto out;
0441     }
0442 
0443     switch (optname) {
0444     case RDS_CANCEL_SENT_TO:
0445         ret = rds_cancel_sent_to(rs, optval, optlen);
0446         break;
0447     case RDS_GET_MR:
0448         ret = rds_get_mr(rs, optval, optlen);
0449         break;
0450     case RDS_GET_MR_FOR_DEST:
0451         ret = rds_get_mr_for_dest(rs, optval, optlen);
0452         break;
0453     case RDS_FREE_MR:
0454         ret = rds_free_mr(rs, optval, optlen);
0455         break;
0456     case RDS_RECVERR:
0457         ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
0458         break;
0459     case RDS_CONG_MONITOR:
0460         ret = rds_cong_monitor(rs, optval, optlen);
0461         break;
0462     case SO_RDS_TRANSPORT:
0463         lock_sock(sock->sk);
0464         ret = rds_set_transport(rs, optval, optlen);
0465         release_sock(sock->sk);
0466         break;
0467     case SO_TIMESTAMP_OLD:
0468     case SO_TIMESTAMP_NEW:
0469         lock_sock(sock->sk);
0470         ret = rds_enable_recvtstamp(sock->sk, optval, optlen, optname);
0471         release_sock(sock->sk);
0472         break;
0473     case SO_RDS_MSG_RXPATH_LATENCY:
0474         ret = rds_recv_track_latency(rs, optval, optlen);
0475         break;
0476     default:
0477         ret = -ENOPROTOOPT;
0478     }
0479 out:
0480     return ret;
0481 }
0482 
0483 static int rds_getsockopt(struct socket *sock, int level, int optname,
0484               char __user *optval, int __user *optlen)
0485 {
0486     struct rds_sock *rs = rds_sk_to_rs(sock->sk);
0487     int ret = -ENOPROTOOPT, len;
0488     int trans;
0489 
0490     if (level != SOL_RDS)
0491         goto out;
0492 
0493     if (get_user(len, optlen)) {
0494         ret = -EFAULT;
0495         goto out;
0496     }
0497 
0498     switch (optname) {
0499     case RDS_INFO_FIRST ... RDS_INFO_LAST:
0500         ret = rds_info_getsockopt(sock, optname, optval,
0501                       optlen);
0502         break;
0503 
0504     case RDS_RECVERR:
0505         if (len < sizeof(int))
0506             ret = -EINVAL;
0507         else
0508         if (put_user(rs->rs_recverr, (int __user *) optval) ||
0509             put_user(sizeof(int), optlen))
0510             ret = -EFAULT;
0511         else
0512             ret = 0;
0513         break;
0514     case SO_RDS_TRANSPORT:
0515         if (len < sizeof(int)) {
0516             ret = -EINVAL;
0517             break;
0518         }
0519         trans = (rs->rs_transport ? rs->rs_transport->t_type :
0520              RDS_TRANS_NONE); /* unbound */
0521         if (put_user(trans, (int __user *)optval) ||
0522             put_user(sizeof(int), optlen))
0523             ret = -EFAULT;
0524         else
0525             ret = 0;
0526         break;
0527     default:
0528         break;
0529     }
0530 
0531 out:
0532     return ret;
0533 
0534 }
0535 
0536 static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
0537                int addr_len, int flags)
0538 {
0539     struct sock *sk = sock->sk;
0540     struct sockaddr_in *sin;
0541     struct rds_sock *rs = rds_sk_to_rs(sk);
0542     int ret = 0;
0543 
0544     if (addr_len < offsetofend(struct sockaddr, sa_family))
0545         return -EINVAL;
0546 
0547     lock_sock(sk);
0548 
0549     switch (uaddr->sa_family) {
0550     case AF_INET:
0551         sin = (struct sockaddr_in *)uaddr;
0552         if (addr_len < sizeof(struct sockaddr_in)) {
0553             ret = -EINVAL;
0554             break;
0555         }
0556         if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
0557             ret = -EDESTADDRREQ;
0558             break;
0559         }
0560         if (ipv4_is_multicast(sin->sin_addr.s_addr) ||
0561             sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
0562             ret = -EINVAL;
0563             break;
0564         }
0565         ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
0566         rs->rs_conn_port = sin->sin_port;
0567         break;
0568 
0569 #if IS_ENABLED(CONFIG_IPV6)
0570     case AF_INET6: {
0571         struct sockaddr_in6 *sin6;
0572         int addr_type;
0573 
0574         sin6 = (struct sockaddr_in6 *)uaddr;
0575         if (addr_len < sizeof(struct sockaddr_in6)) {
0576             ret = -EINVAL;
0577             break;
0578         }
0579         addr_type = ipv6_addr_type(&sin6->sin6_addr);
0580         if (!(addr_type & IPV6_ADDR_UNICAST)) {
0581             __be32 addr4;
0582 
0583             if (!(addr_type & IPV6_ADDR_MAPPED)) {
0584                 ret = -EPROTOTYPE;
0585                 break;
0586             }
0587 
0588             /* It is a mapped address.  Need to do some sanity
0589              * checks.
0590              */
0591             addr4 = sin6->sin6_addr.s6_addr32[3];
0592             if (addr4 == htonl(INADDR_ANY) ||
0593                 addr4 == htonl(INADDR_BROADCAST) ||
0594                 ipv4_is_multicast(addr4)) {
0595                 ret = -EPROTOTYPE;
0596                 break;
0597             }
0598         }
0599 
0600         if (addr_type & IPV6_ADDR_LINKLOCAL) {
0601             /* If socket is arleady bound to a link local address,
0602              * the peer address must be on the same link.
0603              */
0604             if (sin6->sin6_scope_id == 0 ||
0605                 (!ipv6_addr_any(&rs->rs_bound_addr) &&
0606                  rs->rs_bound_scope_id &&
0607                  sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
0608                 ret = -EINVAL;
0609                 break;
0610             }
0611             /* Remember the connected address scope ID.  It will
0612              * be checked against the binding local address when
0613              * the socket is bound.
0614              */
0615             rs->rs_bound_scope_id = sin6->sin6_scope_id;
0616         }
0617         rs->rs_conn_addr = sin6->sin6_addr;
0618         rs->rs_conn_port = sin6->sin6_port;
0619         break;
0620     }
0621 #endif
0622 
0623     default:
0624         ret = -EAFNOSUPPORT;
0625         break;
0626     }
0627 
0628     release_sock(sk);
0629     return ret;
0630 }
0631 
0632 static struct proto rds_proto = {
0633     .name     = "RDS",
0634     .owner    = THIS_MODULE,
0635     .obj_size = sizeof(struct rds_sock),
0636 };
0637 
0638 static const struct proto_ops rds_proto_ops = {
0639     .family =   AF_RDS,
0640     .owner =    THIS_MODULE,
0641     .release =  rds_release,
0642     .bind =     rds_bind,
0643     .connect =  rds_connect,
0644     .socketpair =   sock_no_socketpair,
0645     .accept =   sock_no_accept,
0646     .getname =  rds_getname,
0647     .poll =     rds_poll,
0648     .ioctl =    rds_ioctl,
0649     .listen =   sock_no_listen,
0650     .shutdown = sock_no_shutdown,
0651     .setsockopt =   rds_setsockopt,
0652     .getsockopt =   rds_getsockopt,
0653     .sendmsg =  rds_sendmsg,
0654     .recvmsg =  rds_recvmsg,
0655     .mmap =     sock_no_mmap,
0656     .sendpage = sock_no_sendpage,
0657 };
0658 
0659 static void rds_sock_destruct(struct sock *sk)
0660 {
0661     struct rds_sock *rs = rds_sk_to_rs(sk);
0662 
0663     WARN_ON((&rs->rs_item != rs->rs_item.next ||
0664          &rs->rs_item != rs->rs_item.prev));
0665 }
0666 
0667 static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
0668 {
0669     struct rds_sock *rs;
0670 
0671     sock_init_data(sock, sk);
0672     sock->ops       = &rds_proto_ops;
0673     sk->sk_protocol     = protocol;
0674     sk->sk_destruct     = rds_sock_destruct;
0675 
0676     rs = rds_sk_to_rs(sk);
0677     spin_lock_init(&rs->rs_lock);
0678     rwlock_init(&rs->rs_recv_lock);
0679     INIT_LIST_HEAD(&rs->rs_send_queue);
0680     INIT_LIST_HEAD(&rs->rs_recv_queue);
0681     INIT_LIST_HEAD(&rs->rs_notify_queue);
0682     INIT_LIST_HEAD(&rs->rs_cong_list);
0683     rds_message_zcopy_queue_init(&rs->rs_zcookie_queue);
0684     spin_lock_init(&rs->rs_rdma_lock);
0685     rs->rs_rdma_keys = RB_ROOT;
0686     rs->rs_rx_traces = 0;
0687     rs->rs_tos = 0;
0688     rs->rs_conn = NULL;
0689 
0690     spin_lock_bh(&rds_sock_lock);
0691     list_add_tail(&rs->rs_item, &rds_sock_list);
0692     rds_sock_count++;
0693     spin_unlock_bh(&rds_sock_lock);
0694 
0695     return 0;
0696 }
0697 
0698 static int rds_create(struct net *net, struct socket *sock, int protocol,
0699               int kern)
0700 {
0701     struct sock *sk;
0702 
0703     if (sock->type != SOCK_SEQPACKET || protocol)
0704         return -ESOCKTNOSUPPORT;
0705 
0706     sk = sk_alloc(net, AF_RDS, GFP_KERNEL, &rds_proto, kern);
0707     if (!sk)
0708         return -ENOMEM;
0709 
0710     return __rds_create(sock, sk, protocol);
0711 }
0712 
/* Take a reference on the sock underlying @rs. */
void rds_sock_addref(struct rds_sock *rs)
{
    struct sock *sk = rds_rs_to_sk(rs);

    sock_hold(sk);
}
0717 
/* Drop a reference on the sock underlying @rs. */
void rds_sock_put(struct rds_sock *rs)
{
    struct sock *sk = rds_rs_to_sk(rs);

    sock_put(sk);
}
0722 
0723 static const struct net_proto_family rds_family_ops = {
0724     .family =   AF_RDS,
0725     .create =   rds_create,
0726     .owner  =   THIS_MODULE,
0727 };
0728 
0729 static void rds_sock_inc_info(struct socket *sock, unsigned int len,
0730                   struct rds_info_iterator *iter,
0731                   struct rds_info_lengths *lens)
0732 {
0733     struct rds_sock *rs;
0734     struct rds_incoming *inc;
0735     unsigned int total = 0;
0736 
0737     len /= sizeof(struct rds_info_message);
0738 
0739     spin_lock_bh(&rds_sock_lock);
0740 
0741     list_for_each_entry(rs, &rds_sock_list, rs_item) {
0742         /* This option only supports IPv4 sockets. */
0743         if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
0744             continue;
0745 
0746         read_lock(&rs->rs_recv_lock);
0747 
0748         /* XXX too lazy to maintain counts.. */
0749         list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
0750             total++;
0751             if (total <= len)
0752                 rds_inc_info_copy(inc, iter,
0753                           inc->i_saddr.s6_addr32[3],
0754                           rs->rs_bound_addr_v4,
0755                           1);
0756         }
0757 
0758         read_unlock(&rs->rs_recv_lock);
0759     }
0760 
0761     spin_unlock_bh(&rds_sock_lock);
0762 
0763     lens->nr = total;
0764     lens->each = sizeof(struct rds_info_message);
0765 }
0766 
0767 #if IS_ENABLED(CONFIG_IPV6)
0768 static void rds6_sock_inc_info(struct socket *sock, unsigned int len,
0769                    struct rds_info_iterator *iter,
0770                    struct rds_info_lengths *lens)
0771 {
0772     struct rds_incoming *inc;
0773     unsigned int total = 0;
0774     struct rds_sock *rs;
0775 
0776     len /= sizeof(struct rds6_info_message);
0777 
0778     spin_lock_bh(&rds_sock_lock);
0779 
0780     list_for_each_entry(rs, &rds_sock_list, rs_item) {
0781         read_lock(&rs->rs_recv_lock);
0782 
0783         list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
0784             total++;
0785             if (total <= len)
0786                 rds6_inc_info_copy(inc, iter, &inc->i_saddr,
0787                            &rs->rs_bound_addr, 1);
0788         }
0789 
0790         read_unlock(&rs->rs_recv_lock);
0791     }
0792 
0793     spin_unlock_bh(&rds_sock_lock);
0794 
0795     lens->nr = total;
0796     lens->each = sizeof(struct rds6_info_message);
0797 }
0798 #endif
0799 
0800 static void rds_sock_info(struct socket *sock, unsigned int len,
0801               struct rds_info_iterator *iter,
0802               struct rds_info_lengths *lens)
0803 {
0804     struct rds_info_socket sinfo;
0805     unsigned int cnt = 0;
0806     struct rds_sock *rs;
0807 
0808     len /= sizeof(struct rds_info_socket);
0809 
0810     spin_lock_bh(&rds_sock_lock);
0811 
0812     if (len < rds_sock_count) {
0813         cnt = rds_sock_count;
0814         goto out;
0815     }
0816 
0817     list_for_each_entry(rs, &rds_sock_list, rs_item) {
0818         /* This option only supports IPv4 sockets. */
0819         if (!ipv6_addr_v4mapped(&rs->rs_bound_addr))
0820             continue;
0821         sinfo.sndbuf = rds_sk_sndbuf(rs);
0822         sinfo.rcvbuf = rds_sk_rcvbuf(rs);
0823         sinfo.bound_addr = rs->rs_bound_addr_v4;
0824         sinfo.connected_addr = rs->rs_conn_addr_v4;
0825         sinfo.bound_port = rs->rs_bound_port;
0826         sinfo.connected_port = rs->rs_conn_port;
0827         sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
0828 
0829         rds_info_copy(iter, &sinfo, sizeof(sinfo));
0830         cnt++;
0831     }
0832 
0833 out:
0834     lens->nr = cnt;
0835     lens->each = sizeof(struct rds_info_socket);
0836 
0837     spin_unlock_bh(&rds_sock_lock);
0838 }
0839 
0840 #if IS_ENABLED(CONFIG_IPV6)
0841 static void rds6_sock_info(struct socket *sock, unsigned int len,
0842                struct rds_info_iterator *iter,
0843                struct rds_info_lengths *lens)
0844 {
0845     struct rds6_info_socket sinfo6;
0846     struct rds_sock *rs;
0847 
0848     len /= sizeof(struct rds6_info_socket);
0849 
0850     spin_lock_bh(&rds_sock_lock);
0851 
0852     if (len < rds_sock_count)
0853         goto out;
0854 
0855     list_for_each_entry(rs, &rds_sock_list, rs_item) {
0856         sinfo6.sndbuf = rds_sk_sndbuf(rs);
0857         sinfo6.rcvbuf = rds_sk_rcvbuf(rs);
0858         sinfo6.bound_addr = rs->rs_bound_addr;
0859         sinfo6.connected_addr = rs->rs_conn_addr;
0860         sinfo6.bound_port = rs->rs_bound_port;
0861         sinfo6.connected_port = rs->rs_conn_port;
0862         sinfo6.inum = sock_i_ino(rds_rs_to_sk(rs));
0863 
0864         rds_info_copy(iter, &sinfo6, sizeof(sinfo6));
0865     }
0866 
0867  out:
0868     lens->nr = rds_sock_count;
0869     lens->each = sizeof(struct rds6_info_socket);
0870 
0871     spin_unlock_bh(&rds_sock_lock);
0872 }
0873 #endif
0874 
0875 static void rds_exit(void)
0876 {
0877     sock_unregister(rds_family_ops.family);
0878     proto_unregister(&rds_proto);
0879     rds_conn_exit();
0880     rds_cong_exit();
0881     rds_sysctl_exit();
0882     rds_threads_exit();
0883     rds_stats_exit();
0884     rds_page_exit();
0885     rds_bind_lock_destroy();
0886     rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
0887     rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
0888 #if IS_ENABLED(CONFIG_IPV6)
0889     rds_info_deregister_func(RDS6_INFO_SOCKETS, rds6_sock_info);
0890     rds_info_deregister_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
0891 #endif
0892 }
0893 module_exit(rds_exit);
0894 
0895 u32 rds_gen_num;
0896 
0897 static int rds_init(void)
0898 {
0899     int ret;
0900 
0901     net_get_random_once(&rds_gen_num, sizeof(rds_gen_num));
0902 
0903     ret = rds_bind_lock_init();
0904     if (ret)
0905         goto out;
0906 
0907     ret = rds_conn_init();
0908     if (ret)
0909         goto out_bind;
0910 
0911     ret = rds_threads_init();
0912     if (ret)
0913         goto out_conn;
0914     ret = rds_sysctl_init();
0915     if (ret)
0916         goto out_threads;
0917     ret = rds_stats_init();
0918     if (ret)
0919         goto out_sysctl;
0920     ret = proto_register(&rds_proto, 1);
0921     if (ret)
0922         goto out_stats;
0923     ret = sock_register(&rds_family_ops);
0924     if (ret)
0925         goto out_proto;
0926 
0927     rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
0928     rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
0929 #if IS_ENABLED(CONFIG_IPV6)
0930     rds_info_register_func(RDS6_INFO_SOCKETS, rds6_sock_info);
0931     rds_info_register_func(RDS6_INFO_RECV_MESSAGES, rds6_sock_inc_info);
0932 #endif
0933 
0934     goto out;
0935 
0936 out_proto:
0937     proto_unregister(&rds_proto);
0938 out_stats:
0939     rds_stats_exit();
0940 out_sysctl:
0941     rds_sysctl_exit();
0942 out_threads:
0943     rds_threads_exit();
0944 out_conn:
0945     rds_conn_exit();
0946     rds_cong_exit();
0947     rds_page_exit();
0948 out_bind:
0949     rds_bind_lock_destroy();
0950 out:
0951     return ret;
0952 }
0953 module_init(rds_init);
0954 
0955 #define DRV_VERSION     "4.0"
0956 #define DRV_RELDATE     "Feb 12, 2009"
0957 
0958 MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
0959 MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
0960            " v" DRV_VERSION " (" DRV_RELDATE ")");
0961 MODULE_VERSION(DRV_VERSION);
0962 MODULE_LICENSE("Dual BSD/GPL");
0963 MODULE_ALIAS_NETPROTO(PF_RDS);